From 2809fbf6515e33b8f2cd12fb6a4a81a1068441b9 Mon Sep 17 00:00:00 2001
From: benibus <bpharks@gmx.com>
Date: Wed, 14 Jun 2023 14:52:55 -0400
Subject: [PATCH] Implement column statistics

---
 cpp/src/parquet/float_internal.h   |  61 +++++
 cpp/src/parquet/statistics.cc      | 144 ++++++++++--
 cpp/src/parquet/statistics_test.cc | 343 +++++++++++++++++++++--------
 3 files changed, 442 insertions(+), 106 deletions(-)
 create mode 100644 cpp/src/parquet/float_internal.h
diff --git a/cpp/src/parquet/float_internal.h b/cpp/src/parquet/float_internal.h
new file mode 100644
index 0000000000000..c82c9d575ce3b
--- /dev/null
+++ b/cpp/src/parquet/float_internal.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+struct float16 {
+  constexpr static uint16_t min() { return 0b1111101111111111; }
+  constexpr static uint16_t max() { return 0b0111101111111111; }
+  constexpr static uint16_t positive_zero() { return 0b0000000000000000; }
+  constexpr static uint16_t negative_zero() { return 0b1000000000000000; }
+
+  static uint8_t* min_ptr() { return min_; }
+  static uint8_t* max_ptr() { return max_; }
+  static uint8_t* positive_zero_ptr() { return positive_zero_; }
+  static uint8_t* negative_zero_ptr() { return negative_zero_; }
+
+  static bool is_nan(uint16_t n) { return (n & 0x7c00) == 0x7c00 && (n & 0x03ff) != 0; }
+  static bool is_zero(uint16_t n) { return (n & 0x7fff) == 0; }
+  static bool signbit(uint16_t n) { return (n & 0x8000) != 0; }
+
+  static uint16_t Pack(const uint8_t* src) {
+    return ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs<uint16_t>(src));
+  }
+  static uint16_t Pack(const FLBA& src) { return Pack(src.ptr); }
+
+  static uint8_t* Unpack(uint16_t src, uint8_t* dest) {
+    src = ::arrow::bit_util::ToLittleEndian(src);
+    return static_cast<uint8_t*>(std::memcpy(dest, &src, sizeof(src)));
+  }
+
+ private:
+  static inline uint8_t min_[] = {0b11111111, 0b11111011};
+  static inline uint8_t max_[] = {0b11111111, 0b01111011};
+  static inline uint8_t positive_zero_[] = {0b00000000, 0b00000000};
+  static inline uint8_t negative_zero_[] = {0b00000000, 0b10000000};
+};
+
+}  // namespace parquet
diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index aa6df7e32a98c..ad4e12e3e60f0 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -35,6 +35,7 @@
 #include "arrow/visit_data_inline.h"
 #include "parquet/encoding.h"
 #include "parquet/exception.h"
+#include "parquet/float_internal.h"
 #include "parquet/platform.h"
 #include "parquet/schema.h"
 
@@ -277,11 +278,54 @@ template <bool is_signed>
 struct CompareHelper<FLBAType, is_signed>
     : public BinaryLikeCompareHelperBase<FLBAType, is_signed> {};
 
+struct Float16CompareHelper {
+  using T = FLBA;
+
+  static T DefaultMin() { return T{float16::max_ptr()}; }
+  static T DefaultMax() { return T{float16::min_ptr()}; }
+
+  static T Coalesce(T val, T fallback) {
+    return val.ptr != nullptr && float16::is_nan(float16::Pack(val)) ? fallback : val;
+  }
+
+  static inline bool Compare(int type_length, const T& a, const T& b) {
+    uint16_t l = float16::Pack(a);
+    uint16_t r = float16::Pack(b);
+
+    if (l & 0x8000) {
+      if (r & 0x8000) {
+        // Both are negative
+        return (l & 0x7fff) > (r & 0x7fff);
+      } else {
+        // Handle +/-0
+        return (l & 0x7fff) || r != 0;
+      }
+    } else if (r & 0x8000) {
+      return false;
+    } else {
+      // Both are positive
+      return (l & 0x7fff) < (r & 0x7fff);
+    }
+  }
+
+  static T Min(int type_length, const T& a, const T& b) {
+    if (a.ptr == nullptr) return b;
+    if (b.ptr == nullptr) return a;
+    return Compare(type_length, a, b) ? a : b;
+  }
+
+  static T Max(int type_length, const T& a, const T& b) {
+    if (a.ptr == nullptr) return b;
+    if (b.ptr == nullptr) return a;
+    return Compare(type_length, a, b) ? b : a;
+  }
+};
+
 using ::std::optional;
 
 template <typename T>
 ::arrow::enable_if_t<std::is_integral<T>::value, optional<std::pair<T, T>>>
-CleanStatistic(std::pair<T, T> min_max) {
+CleanStatistic(std::pair<T, T> min_max, LogicalType::Type::type) {
   return min_max;
 }
 
@@ -292,7 +336,7 @@ CleanStatistic(std::pair<T, T> min_max) {
 // - If max is -0.0f, replace with 0.0f
 template <typename T>
 ::arrow::enable_if_t<std::is_floating_point<T>::value, optional<std::pair<T, T>>>
-CleanStatistic(std::pair<T, T> min_max) {
+CleanStatistic(std::pair<T, T> min_max, LogicalType::Type::type) {
   T min = min_max.first;
   T max = min_max.second;
 
@@ -318,26 +362,55 @@ CleanStatistic(std::pair<T, T> min_max) {
   return {{min, max}};
 }
 
-optional<std::pair<FLBA, FLBA>> CleanStatistic(std::pair<FLBA, FLBA> min_max) {
+optional<std::pair<FLBA, FLBA>> CleanFloat16Statistic(std::pair<FLBA, FLBA> min_max) {
+  FLBA min = min_max.first;
+  FLBA max = min_max.second;
+  uint16_t min_packed = float16::Pack(min);
+  uint16_t max_packed = float16::Pack(max);
+
+  if (float16::is_nan(min_packed) || float16::is_nan(max_packed)) {
+    return ::std::nullopt;
+  }
+
+  if (min_packed == float16::max() && max_packed == float16::min()) {
+    return ::std::nullopt;
+  }
+
+  if (min_packed == float16::positive_zero()) {
+    min = FLBA{float16::negative_zero_ptr()};
+  }
+  if (max_packed == float16::negative_zero()) {
+    max = FLBA{float16::positive_zero_ptr()};
+  }
+
+  return {{min, max}};
+}
+
+optional<std::pair<FLBA, FLBA>> CleanStatistic(std::pair<FLBA, FLBA> min_max,
+                                               LogicalType::Type::type logical_type) {
   if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
     return ::std::nullopt;
   }
+  if (logical_type == LogicalType::Type::FLOAT16) {
+    return CleanFloat16Statistic(std::move(min_max));
+  }
   return min_max;
 }
 
 optional<std::pair<ByteArray, ByteArray>> CleanStatistic(
-    std::pair<ByteArray, ByteArray> min_max) {
+    std::pair<ByteArray, ByteArray> min_max, LogicalType::Type::type) {
   if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
     return ::std::nullopt;
   }
   return min_max;
 }
 
-template <bool is_signed, typename DType>
+template <bool is_signed, typename DType,
+          typename HelperType = CompareHelper<DType, is_signed>>
 class TypedComparatorImpl : virtual public TypedComparator<DType> {
  public:
   using T = typename DType::c_type;
-  using Helper = CompareHelper<DType, is_signed>;
+  using Helper = HelperType;
 
   explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {}
 
@@ -412,9 +485,9 @@ TypedComparatorImpl</*is_signed=*/false, Int32Type>::GetMinMax(const int32_t* va
   return {SafeCopy<int32_t>(min), SafeCopy<int32_t>(max)};
 }
 
-template <bool is_signed, typename DType>
+template <bool is_signed, typename DType, typename Helper>
 std::pair<typename DType::c_type, typename DType::c_type>
-TypedComparatorImpl<is_signed, DType>::GetMinMax(const ::arrow::Array& values) {
+TypedComparatorImpl<is_signed, DType, Helper>::GetMinMax(const ::arrow::Array& values) {
   ParquetException::NYI(values.type()->ToString());
 }
 
@@ -458,6 +531,16 @@ std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, ByteArrayType>::GetMi
   return GetMinMaxBinaryHelper<false>(*this, values);
 }
 
+static LogicalType::Type::type LogicalTypeId(const ColumnDescriptor* descr) {
+  if (const auto& logical_type = descr->logical_type()) {
+    return logical_type->type();
+  }
+  return LogicalType::Type::NONE;
+}
+static LogicalType::Type::type LogicalTypeId(const Statistics& stats) {
+  return LogicalTypeId(stats.descr());
+}
+
 template <typename DType>
 class TypedStatisticsImpl : public TypedStatistics<DType> {
  public:
@@ -468,8 +551,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
         pool_(pool),
         min_buffer_(AllocateBuffer(pool_, 0)),
         max_buffer_(AllocateBuffer(pool_, 0)) {
-    auto comp = Comparator::Make(descr);
-    comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
+    comparator_ = MakeComparator<DType>(descr);
     TypedStatisticsImpl::Reset();
     has_null_count_ = true;
     has_distinct_count_ = true;
@@ -525,6 +607,19 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
   bool Equals(const Statistics& raw_other) const override {
     if (physical_type() != raw_other.physical_type()) return false;
 
+    const auto logical_id = LogicalTypeId(*this);
+    switch (logical_id) {
+      // Only compare against logical types that influence the interpretation of the
+      // physical type
+      case LogicalType::Type::FLOAT16:
+        if (LogicalTypeId(raw_other) != logical_id) {
+          return false;
+        }
+        break;
+      default:
+        break;
+    }
+
     const auto& other = checked_cast<const TypedStatisticsImpl&>(raw_other);
 
     if (has_min_max_ != other.has_min_max_) return false;
@@ -650,7 +745,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
   void SetMinMaxPair(std::pair<T, T> min_max) {
     // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
-    auto maybe_min_max = CleanStatistic(min_max);
+    auto maybe_min_max = CleanStatistic(min_max, LogicalTypeId(*this));
     if (!maybe_min_max) return;
 
     auto min = maybe_min_max.value().first;
@@ -759,12 +854,8 @@ void TypedStatisticsImpl<ByteArrayType>::PlainDecode(const std::string& src,
   dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
 }
 
-}  // namespace
-
-// ----------------------------------------------------------------------
-// Public factory functions
-
-std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
+std::shared_ptr<Comparator> DoMakeComparator(Type::type physical_type,
+                                             LogicalType::Type::type logical_type,
                                              SortOrder::type sort_order,
                                              int type_length) {
   if (SortOrder::SIGNED == sort_order) {
@@ -784,6 +875,10 @@ std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
       case Type::BYTE_ARRAY:
         return std::make_shared<TypedComparatorImpl<true, ByteArrayType>>();
       case Type::FIXED_LEN_BYTE_ARRAY:
+        if (logical_type == LogicalType::Type::FLOAT16) {
+          return std::make_shared<
+              TypedComparatorImpl<true, FLBAType, Float16CompareHelper>>();
+        }
         return std::make_shared<TypedComparatorImpl<true, FLBAType>>(type_length);
       default:
         ParquetException::NYI("Signed Compare not implemented");
@@ -809,8 +904,21 @@ std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
   return nullptr;
 }
 
+}  // namespace
+
+// ----------------------------------------------------------------------
+// Public factory functions
+
+std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
+                                             SortOrder::type sort_order,
+                                             int type_length) {
+  return DoMakeComparator(physical_type, LogicalType::Type::NONE, sort_order,
+                          type_length);
+}
+
 std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
-  return Make(descr->physical_type(), descr->sort_order(), descr->type_length());
+  return DoMakeComparator(descr->physical_type(), LogicalTypeId(descr),
+                          descr->sort_order(), descr->type_length());
 }
 
 std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc
index 8b9a42aa18bba..dcb6fda9209d6 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -40,6 +40,7 @@
 #include "parquet/column_writer.h"
 #include "parquet/file_reader.h"
 #include "parquet/file_writer.h"
+#include "parquet/float_internal.h"
 #include "parquet/platform.h"
 #include "parquet/schema.h"
 #include "parquet/statistics.h"
@@ -680,9 +681,22 @@ TEST(CorrectStatistics, Basics) {
 // Test SortOrder class
 static const int NUM_VALUES = 10;
 
-template <typename TestType>
+template <typename T>
+struct RebindLogical {
+  using ParquetType = T;
+  using CType = typename T::c_type;
+};
+
+template <>
+struct RebindLogical<Float16LogicalType> {
+  using ParquetType = FLBAType;
+  using CType = ParquetType::c_type;
+};
+
+template <typename T>
 class TestStatisticsSortOrder : public ::testing::Test {
  public:
+  using TestType = typename RebindLogical<T>::ParquetType;
   using c_type = typename TestType::c_type;
 
   void SetUp() override {
@@ -760,7 +774,7 @@ class TestStatisticsSortOrder : public ::testing::Test {
 };
 
 using CompareTestTypes = ::testing::Types<Int32Type, Int64Type, FloatType, DoubleType,
-                                          ByteArrayType, FLBAType>;
+                                          ByteArrayType, FLBAType, Float16LogicalType>;
 
 // TYPE::INT32
 template <>
@@ -907,6 +921,36 @@ void TestStatisticsSortOrder<FLBAType>::SetValues() {
       .set_max(std::string(reinterpret_cast<const char*>(&vals[8][0]), FLBA_LENGTH));
 }
 
+template <>
+void TestStatisticsSortOrder<Float16LogicalType>::AddNodes(std::string name) {
+  auto node =
+      schema::PrimitiveNode::Make(name, Repetition::REQUIRED, LogicalType::Float16(),
+                                  Type::FIXED_LEN_BYTE_ARRAY, sizeof(uint16_t));
+  fields_.push_back(std::move(node));
+}
+
+template <>
+void TestStatisticsSortOrder<Float16LogicalType>::SetValues() {
+  constexpr int kValueLen = 2;
+  constexpr int kNumBytes = NUM_VALUES * kValueLen;
+
+  const uint16_t packed_vals[NUM_VALUES] = {
+      0b0000000000000000, 0b0000000000000000, 0b1000000000000000, 0b1000010000000000,
+      0b0111110000001000, 0b1000000000000000, 0b0000010000000000, 0b0000000001000000,
+      0b1111110000001000, 0b1000000001000000};
+
+  values_buf_.resize(kNumBytes);
+  uint8_t* ptr = values_buf_.data();
+  for (int i = 0; i < NUM_VALUES; ++i) {
+    values_[i].ptr = float16::Unpack(packed_vals[i], ptr);
+    ptr += kValueLen;
+  }
+
+  stats_[0]
+      .set_min(std::string(reinterpret_cast<const char*>(values_[3].ptr), kValueLen))
+      .set_max(std::string(reinterpret_cast<const char*>(values_[6].ptr), kValueLen));
+}
+
 TYPED_TEST_SUITE(TestStatisticsSortOrder, CompareTestTypes);
 
 TYPED_TEST(TestStatisticsSortOrder, MinMax) {
@@ -972,12 +1016,20 @@ TEST_F(TestStatisticsSortOrderFLBA, UnknownSortOrder) {
   ASSERT_FALSE(cc_metadata->is_stats_set());
 }
 
+template <typename T>
+static std::string EncodeValue(const T& val) {
+  return std::string(reinterpret_cast<const char*>(&val), sizeof(val));
+}
+static std::string EncodeValue(const FLBA& val, int length = sizeof(uint16_t)) {
+  return std::string(reinterpret_cast<const char*>(val.ptr), length);
+}
+
 template <typename Stats, typename Array, typename T = typename Array::value_type>
 void AssertMinMaxAre(Stats stats, const Array& values, T expected_min, T expected_max) {
   stats->Update(values.data(), values.size(), 0);
   ASSERT_TRUE(stats->HasMinMax());
-  EXPECT_EQ(stats->min(), expected_min);
-  EXPECT_EQ(stats->max(), expected_max);
+  EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min));
+  EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max));
 }
 
 template <typename Stats, typename Array, typename T = typename Stats::T>
@@ -989,8 +1041,8 @@ void AssertMinMaxAre(Stats stats, const Array& values, const uint8_t* valid_bitm
   stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count + null_count,
                       non_null_count, null_count);
   ASSERT_TRUE(stats->HasMinMax());
-  EXPECT_EQ(stats->min(), expected_min);
-  EXPECT_EQ(stats->max(), expected_max);
+  EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min));
+  EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max));
 }
 
 template <typename Stats, typename Array>
@@ -1073,50 +1125,217 @@ void CheckExtrema() {
 TEST(TestStatistic, Int32Extrema) { CheckExtrema<Int32Type>(); }
 TEST(TestStatistic, Int64Extrema) { CheckExtrema<Int64Type>(); }
 
-// PARQUET-1225: Float NaN values may lead to incorrect min-max
-template <typename ParquetType>
-void CheckNaNs() {
-  using T = typename ParquetType::c_type;
+template <typename T>
+class TestFloatStatistics : public ::testing::Test {
+ public:
+  using ParquetType = typename RebindLogical<T>::ParquetType;
+  using c_type = typename ParquetType::c_type;
+
+  void Init();
+  void SetUp() override { this->Init(); }
+
+  bool signbit(c_type val);
+  void CheckEq(const c_type& l, const c_type& r);
+  NodePtr MakeNode(const std::string& name, Repetition::type rep);
+
+  template <typename Stats, typename Values>
+  void CheckMinMaxZeroesSign(Stats stats, const Values& values) {
+    stats->Update(values.data(), values.size(), 0);
+    ASSERT_TRUE(stats->HasMinMax());
+
+    this->CheckEq(stats->min(), positive_zero_);
+    ASSERT_TRUE(this->signbit(stats->min()));
+
+    this->CheckEq(stats->max(), positive_zero_);
+    ASSERT_FALSE(this->signbit(stats->max()));
+  }
+
+  // ARROW-5562: Ensure that -0.0f and 0.0f values are properly handled like in
+  // parquet-mr
+  void TestNegativeZeroes() {
+    NodePtr node = this->MakeNode("f", Repetition::OPTIONAL);
+    ColumnDescriptor descr(node, 1, 1);
+
+    {
+      std::array<c_type, 2> values{negative_zero_, positive_zero_};
+      auto stats = MakeStatistics<ParquetType>(&descr);
+      CheckMinMaxZeroesSign(stats, values);
+    }
+
+    {
+      std::array<c_type, 2> values{positive_zero_, negative_zero_};
+      auto stats = MakeStatistics<ParquetType>(&descr);
+      CheckMinMaxZeroesSign(stats, values);
+    }
+
+    {
+      std::array<c_type, 2> values{negative_zero_, negative_zero_};
+      auto stats = MakeStatistics<ParquetType>(&descr);
+      CheckMinMaxZeroesSign(stats, values);
+    }
+
+    {
+      std::array<c_type, 2> values{positive_zero_, positive_zero_};
+      auto stats = MakeStatistics<ParquetType>(&descr);
+      CheckMinMaxZeroesSign(stats, values);
+    }
+  }
+
+  // PARQUET-1225: Float NaN values may lead to incorrect min-max
+  template <typename Values>
+  void CheckNaNs(ColumnDescriptor* descr, const Values& all_nans, const Values& some_nans,
+                 const Values& other_nans, c_type min, c_type max, uint8_t valid_bitmap,
+                 uint8_t valid_bitmap_no_nans) {
+    auto some_nan_stats = MakeStatistics<ParquetType>(descr);
+    // Ingesting only nans should not yield valid min max
+    AssertUnsetMinMax(some_nan_stats, all_nans);
+    // Ingesting a mix of NaNs and non-NaNs should not yield valid min max.
+    AssertMinMaxAre(some_nan_stats, some_nans, min, max);
+    // Ingesting only nans after a valid min/max, should have not effect
+    AssertMinMaxAre(some_nan_stats, all_nans, min, max);
+
+    some_nan_stats = MakeStatistics<ParquetType>(descr);
+    AssertUnsetMinMax(some_nan_stats, all_nans, &valid_bitmap);
+    // NaNs should not pollute min max when excluded via null bitmap.
+    AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap_no_nans, min, max);
+    // Ingesting NaNs with a null bitmap should not change the result.
+    AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap, min, max);
+
+    // An array that doesn't start with NaN
+    auto other_stats = MakeStatistics<ParquetType>(descr);
+    AssertMinMaxAre(other_stats, other_nans, min, max);
+  }
+
+  void TestNaNs();
+
+ protected:
+  std::vector<uint8_t> data_buf_;
+  c_type positive_zero_;
+  c_type negative_zero_;
+};
+
+template <typename T>
+void TestFloatStatistics<T>::Init() {
+  positive_zero_ = c_type{};
+  negative_zero_ = -positive_zero_;
+}
+template <>
+void TestFloatStatistics<Float16LogicalType>::Init() {
+  positive_zero_ = c_type{float16::positive_zero_ptr()};
+  negative_zero_ = c_type{float16::negative_zero_ptr()};
+}
+
+template <typename T>
+NodePtr TestFloatStatistics<T>::MakeNode(const std::string& name, Repetition::type rep) {
+  return PrimitiveNode::Make(name, rep, ParquetType::type_num);
+}
+template <>
+NodePtr TestFloatStatistics<Float16LogicalType>::MakeNode(const std::string& name,
+                                                          Repetition::type rep) {
+  return PrimitiveNode::Make(name, rep, LogicalType::Float16(),
+                             Type::FIXED_LEN_BYTE_ARRAY, 2);
+}
 
+template <typename T>
+void TestFloatStatistics<T>::CheckEq(const c_type& l, const c_type& r) {
+  ASSERT_EQ(l, r);
+}
+template <>
+void TestFloatStatistics<Float16LogicalType>::CheckEq(const c_type& a, const c_type& b) {
+  auto l = float16::Pack(a);
+  auto r = float16::Pack(b);
+  if (float16::is_zero(l) && float16::is_zero(r)) return;
+  ASSERT_EQ(l, r);
+}
+
+template <typename T>
+bool TestFloatStatistics<T>::signbit(c_type val) {
+  return std::signbit(val);
+}
+template <>
+bool TestFloatStatistics<Float16LogicalType>::signbit(c_type val) {
+  return float16::signbit(float16::Pack(val));
+}
+
+template <typename T>
+void TestFloatStatistics<T>::TestNaNs() {
   constexpr int kNumValues = 8;
-  NodePtr node = PrimitiveNode::Make("f", Repetition::OPTIONAL, ParquetType::type_num);
+  NodePtr node = this->MakeNode("f", Repetition::OPTIONAL);
   ColumnDescriptor descr(node, 1, 1);
 
-  constexpr T nan = std::numeric_limits<T>::quiet_NaN();
-  constexpr T min = -4.0f;
-  constexpr T max = 3.0f;
+  constexpr c_type nan = std::numeric_limits<c_type>::quiet_NaN();
+  constexpr c_type min = -4.0f;
+  constexpr c_type max = 3.0f;
+
+  std::array<c_type, kNumValues> all_nans{nan, nan, nan, nan, nan, nan, nan, nan};
+  std::array<c_type, kNumValues> some_nans{nan, max, -3.0f, -1.0f, nan, 2.0f, min, nan};
+  std::array<c_type, kNumValues> other_nans{1.5f, max, -3.0f, -1.0f, nan, 2.0f, min, nan};
 
-  std::array<T, kNumValues> all_nans{nan, nan, nan, nan, nan, nan, nan, nan};
-  std::array<T, kNumValues> some_nans{nan, max, -3.0f, -1.0f, nan, 2.0f, min, nan};
   uint8_t valid_bitmap = 0x7F;  // 0b01111111
   // NaNs excluded
   uint8_t valid_bitmap_no_nans = 0x6E;  // 0b01101110
 
-  // Test values
-  auto some_nan_stats = MakeStatistics<ParquetType>(&descr);
-  // Ingesting only nans should not yield valid min max
-  AssertUnsetMinMax(some_nan_stats, all_nans);
-  // Ingesting a mix of NaNs and non-NaNs should not yield valid min max.
-  AssertMinMaxAre(some_nan_stats, some_nans, min, max);
-  // Ingesting only nans after a valid min/max, should have not effect
-  AssertMinMaxAre(some_nan_stats, all_nans, min, max);
+  this->CheckNaNs(&descr, all_nans, some_nans, other_nans, min, max, valid_bitmap,
+                  valid_bitmap_no_nans);
+}
+
+template <>
+void TestFloatStatistics<Float16LogicalType>::TestNaNs() {
+  constexpr int kNumValues = 8;
+  constexpr int kValueLen = sizeof(uint16_t);
+
+  NodePtr node = this->MakeNode("f", Repetition::OPTIONAL);
+  ColumnDescriptor descr(node, 1, 1);
+
+  const uint16_t nan_int = 0b1111110010101010;
+  const uint16_t min_int = 0b1010010111000110;
+  const uint16_t max_int = 0b0011100011010011;
+  uint8_t min_max_data[2 * kValueLen];
+  const auto min = FLBA{float16::Unpack(min_int, &min_max_data[0 * kValueLen])};
+  const auto max = FLBA{float16::Unpack(max_int, &min_max_data[1 * kValueLen])};
+
+  std::array<uint16_t, kNumValues> all_nans_packed = {nan_int, nan_int, nan_int, nan_int,
+                                                      nan_int, nan_int, nan_int, nan_int};
+  std::array<uint16_t, kNumValues> some_nans_packed = {nan_int,
+                                                       max_int,
+                                                       0b1000111000110000,
+                                                       0b1000010001000001,
+                                                       nan_int,
+                                                       0b0000100000011110,
+                                                       min_int,
+                                                       nan_int};
+  std::array<uint16_t, kNumValues> other_nans_packed = some_nans_packed;
+  other_nans_packed[0] = 0b0000010000110011;
+
+  std::array<uint8_t, (kNumValues * kValueLen * 3)> bytes;
+  uint8_t* at = bytes.data();
+  auto prepare_values = [&](const auto& packed_values) -> std::vector<FLBA> {
+    std::vector<FLBA> out;
+    for (uint16_t packed : packed_values) {
+      out.push_back(FLBA{float16::Unpack(packed, at)});
+      at += kValueLen;
+    }
+    return out;
+  };
+
+  auto all_nans = prepare_values(all_nans_packed);
+  auto some_nans = prepare_values(some_nans_packed);
+  auto other_nans = prepare_values(other_nans_packed);
 
-  some_nan_stats = MakeStatistics<ParquetType>(&descr);
-  AssertUnsetMinMax(some_nan_stats, all_nans, &valid_bitmap);
-  // NaNs should not pollute min max when excluded via null bitmap.
-  AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap_no_nans, min, max);
-  // Ingesting NaNs with a null bitmap should not change the result.
-  AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap, min, max);
+  uint8_t valid_bitmap = 0x7F;  // 0b01111111
+  // NaNs excluded
+  uint8_t valid_bitmap_no_nans = 0x6E;  // 0b01101110
 
-  // An array that doesn't start with NaN
-  std::array<T, kNumValues> other_nans{1.5f, max, -3.0f, -1.0f, nan, 2.0f, min, nan};
-  auto other_stats = MakeStatistics<ParquetType>(&descr);
-  AssertMinMaxAre(other_stats, other_nans, min, max);
+  this->CheckNaNs(&descr, all_nans, some_nans, other_nans, min, max, valid_bitmap,
+                  valid_bitmap_no_nans);
 }
 
-TEST(TestStatistic, NaNFloatValues) { CheckNaNs<FloatType>(); }
+using FloatingPointTypes = ::testing::Types<FloatType, DoubleType, Float16LogicalType>;
+
+TYPED_TEST_SUITE(TestFloatStatistics, FloatingPointTypes);
 
-TEST(TestStatistic, NaNDoubleValues) { CheckNaNs<DoubleType>(); }
+TYPED_TEST(TestFloatStatistics, NegativeZeros) { this->TestNegativeZeroes(); }
+TYPED_TEST(TestFloatStatistics, NaNs) { this->TestNaNs(); }
 
 // ARROW-7376
 TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) {
@@ -1132,58 +1351,6 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) {
   AssertUnsetMinMax(stats, nans_but_last, &all_but_last_valid);
 }
 
-template <typename Stats, typename Array, typename T = typename Array::value_type>
-void AssertMinMaxZeroesSign(Stats stats, const Array& values) {
-  stats->Update(values.data(), values.size(), 0);
-  ASSERT_TRUE(stats->HasMinMax());
-
-  T zero{};
-  ASSERT_EQ(stats->min(), zero);
-  ASSERT_TRUE(std::signbit(stats->min()));
-
-  ASSERT_EQ(stats->max(), zero);
-  ASSERT_FALSE(std::signbit(stats->max()));
-}
-
-// ARROW-5562: Ensure that -0.0f and 0.0f values are properly handled like in
-// parquet-mr
-template <typename ParquetType>
-void CheckNegativeZeroStats() {
-  using T = typename ParquetType::c_type;
-
-  NodePtr node = PrimitiveNode::Make("f", Repetition::OPTIONAL, ParquetType::type_num);
-  ColumnDescriptor descr(node, 1, 1);
-  T zero{};
-
-  {
-    std::array<T, 2> values{-zero, zero};
-    auto stats = MakeStatistics<ParquetType>(&descr);
-    AssertMinMaxZeroesSign(stats, values);
-  }
-
-  {
-    std::array<T, 2> values{zero, -zero};
-    auto stats = MakeStatistics<ParquetType>(&descr);
-    AssertMinMaxZeroesSign(stats, values);
-  }
-
-  {
-    std::array<T, 2> values{-zero, -zero};
-    auto stats = MakeStatistics<ParquetType>(&descr);
-    AssertMinMaxZeroesSign(stats, values);
-  }
-
-  {
-    std::array<T, 2> values{zero, zero};
-    auto stats = MakeStatistics<ParquetType>(&descr);
-    AssertMinMaxZeroesSign(stats, values);
-  }
-}
-
-TEST(TestStatistics, FloatNegativeZero) { CheckNegativeZeroStats<FloatType>(); }
-
-TEST(TestStatistics, DoubleNegativeZero) { CheckNegativeZeroStats<DoubleType>(); }
-
 // Test statistics for binary column with UNSIGNED sort order
 TEST(TestStatisticsSortOrderMinMax, Unsigned) {
   std::string dir_string(test::get_data_dir());