From f676360b0c39d42bed4c97ca186f3ccb6d41b4fd Mon Sep 17 00:00:00 2001
From: Bikramjeet Vig <bikramjeet@meta.com>
Date: Wed, 24 Jul 2024 11:48:14 -0700
Subject: [PATCH] Fix MultiRange filter to correctly handle NaN input (#10535)

Summary:
Pull Request resolved: https://github.com/facebookincubator/velox/pull/10535

The current MultiRange filter includes an extra flag specifically for
allowing or filtering out NaN values, independent of its internal
filters. This responsibility should instead be managed by the filters
themselves. This change therefore removes that additional handling to
ensure consistency between using a single filter VS using multiple.

Reviewed By: Yuhta

Differential Revision: D60138398
---
 velox/connectors/hive/HiveConnectorUtil.cpp   |  2 +-
 velox/expression/ExprToSubfieldFilter.cpp     |  3 +-
 velox/expression/ExprToSubfieldFilter.h       | 10 ++-----
 velox/type/Filter.cpp                         | 28 +++++--------------
 velox/type/Filter.h                           | 15 +++-------
 velox/type/tests/FilterSerDeTest.cpp          |  2 +-
 velox/type/tests/FilterTest.cpp               | 28 ++++++-------------
 .../type/tests/NegatedBytesRangeBenchmark.cpp |  4 +--
 .../tests/NegatedBytesValuesBenchmark.cpp     |  2 +-
 9 files changed, 28 insertions(+), 66 deletions(-)
diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp
index 7a3004db1a38..1322c85e3771 100644
--- a/velox/connectors/hive/HiveConnectorUtil.cpp
+++ b/velox/connectors/hive/HiveConnectorUtil.cpp
@@ -91,7 +91,7 @@ std::unique_ptr<common::Filter> makeFloatingPointMapKeyFilter(
   if (filters.size() == 1) {
     return std::move(filters[0]);
   }
-  return std::make_unique<common::MultiRange>(std::move(filters), false, false);
+  return std::make_unique<common::MultiRange>(std::move(filters), false);
 }
 
 // Recursively add subfields to scan spec.
diff --git a/velox/expression/ExprToSubfieldFilter.cpp b/velox/expression/ExprToSubfieldFilter.cpp
index 8760dda6e693..f876c5de3fa8 100644
--- a/velox/expression/ExprToSubfieldFilter.cpp
+++ b/velox/expression/ExprToSubfieldFilter.cpp
@@ -336,8 +336,7 @@ std::unique_ptr<common::Filter> makeNotEqualFilter(
     std::vector<std::unique_ptr<common::Filter>> filters;
     filters.emplace_back(std::move(lessThanFilter));
     filters.emplace_back(std::move(greaterThanFilter));
-    return std::make_unique<common::MultiRange>(
-        std::move(filters), false, false);
+    return std::make_unique<common::MultiRange>(std::move(filters), false);
   }
 }
 
diff --git a/velox/expression/ExprToSubfieldFilter.h b/velox/expression/ExprToSubfieldFilter.h
index 193e3413ea25..f87d28e2530b 100644
--- a/velox/expression/ExprToSubfieldFilter.h
+++ b/velox/expression/ExprToSubfieldFilter.h
@@ -320,16 +320,12 @@ inline std::unique_ptr<common::IsNotNull> isNotNull() {
 }
 
 template <typename T>
-std::unique_ptr<common::MultiRange> orFilter(
-    std::unique_ptr<T> a,
-    std::unique_ptr<T> b,
-    bool nullAllowed = false,
-    bool nanAllowed = false) {
+std::unique_ptr<common::MultiRange>
+orFilter(std::unique_ptr<T> a, std::unique_ptr<T> b, bool nullAllowed = false) {
   std::vector<std::unique_ptr<common::Filter>> filters;
   filters.emplace_back(std::move(a));
   filters.emplace_back(std::move(b));
-  return std::make_unique<common::MultiRange>(
-      std::move(filters), nullAllowed, nanAllowed);
+  return std::make_unique<common::MultiRange>(std::move(filters), nullAllowed);
 }
 
 inline std::unique_ptr<common::HugeintRange> lessThanHugeint(
diff --git a/velox/type/Filter.cpp b/velox/type/Filter.cpp
index 07d64b3b0763..d0f6932b4aad 100644
--- a/velox/type/Filter.cpp
+++ b/velox/type/Filter.cpp
@@ -712,7 +712,6 @@ bool NegatedBytesValues::testingEquals(const Filter& other) const {
 
 folly::dynamic MultiRange::serialize() const {
   auto obj = Filter::serializeBase("MultiRange");
-  obj["nanAllowed"] = nanAllowed_;
   folly::dynamic arr = folly::dynamic::array;
   for (const auto& f : filters_) {
     arr.push_back(f->serialize());
@@ -723,7 +722,6 @@ folly::dynamic MultiRange::serialize() const {
 
 FilterPtr MultiRange::create(const folly::dynamic& obj) {
   auto nullAllowed = deserializeNullAllowed(obj);
-  auto nanAllowed = obj["nanAllowed"].asBool();
   folly::dynamic arr = obj["filters"];
   auto tmpFilters = ISerializable::deserialize<std::vector<Filter>>(arr);
 
@@ -733,14 +731,13 @@ FilterPtr MultiRange::create(const folly::dynamic& obj) {
     filters.emplace_back(f->clone());
   }
 
-  return std::make_unique<MultiRange>(
-      std::move(filters), nullAllowed, nanAllowed);
+  return std::make_unique<MultiRange>(std::move(filters), nullAllowed);
 }
 
 bool MultiRange::testingEquals(const Filter& other) const {
   auto otherMultiRange = dynamic_cast<const MultiRange*>(&other);
   auto res = otherMultiRange != nullptr && Filter::testingBaseEquals(other) &&
-      nanAllowed_ == otherMultiRange->nanAllowed_ &&
+
       filters_.size() == otherMultiRange->filters_.size();
 
   if (!res) {
@@ -1357,7 +1354,7 @@ std::unique_ptr<Filter> NegatedBytesRange::toMultiRange() const {
   if (accepted.size() == 1) {
     return accepted[0]->clone(nullAllowed_);
   }
-  return std::make_unique<MultiRange>(std::move(accepted), nullAllowed_, false);
+  return std::make_unique<MultiRange>(std::move(accepted), nullAllowed_);
 }
 
 bool NegatedBytesValues::testBytesRange(
@@ -1439,17 +1436,13 @@ std::unique_ptr<Filter> MultiRange::clone(
 
   if (nullAllowed) {
     return std::make_unique<MultiRange>(
-        std::move(filters), nullAllowed.value(), nanAllowed_);
+        std::move(filters), nullAllowed.value());
   } else {
-    return std::make_unique<MultiRange>(
-        std::move(filters), nullAllowed_, nanAllowed_);
+    return std::make_unique<MultiRange>(std::move(filters), nullAllowed_);
   }
 }
 
 bool MultiRange::testDouble(double value) const {
-  if (std::isnan(value)) {
-    return nanAllowed_;
-  }
   for (const auto& filter : filters_) {
     if (filter->testDouble(value)) {
       return true;
@@ -1459,9 +1452,6 @@ bool MultiRange::testDouble(double value) const {
 }
 
 bool MultiRange::testFloat(float value) const {
-  if (std::isnan(value)) {
-    return nanAllowed_;
-  }
   for (const auto& filter : filters_) {
     if (filter->testFloat(value)) {
       return true;
@@ -1554,7 +1544,6 @@ std::unique_ptr<Filter> MultiRange::mergeWith(const Filter* other) const {
     case FilterKind::kBytesRange:
     case FilterKind::kMultiRange: {
       bool bothNullAllowed = nullAllowed_ && other->testNull();
-      bool bothNanAllowed = nanAllowed_;
       std::vector<const Filter*> otherFilters;
 
       if (other->kind() == FilterKind::kMultiRange) {
@@ -1562,7 +1551,6 @@ std::unique_ptr<Filter> MultiRange::mergeWith(const Filter* other) const {
         for (auto const& filterOther : multiRangeOther->filters()) {
           otherFilters.emplace_back(filterOther.get());
         }
-        bothNanAllowed = bothNanAllowed && multiRangeOther->nanAllowed();
       } else {
         otherFilters.emplace_back(other);
       }
@@ -1614,8 +1602,7 @@ std::unique_ptr<Filter> MultiRange::mergeWith(const Filter* other) const {
       } else if (merged.size() == 1) {
         return merged.front()->clone(bothNullAllowed);
       } else {
-        return std::make_unique<MultiRange>(
-            std::move(merged), bothNullAllowed, bothNanAllowed);
+        return std::make_unique<MultiRange>(std::move(merged), bothNullAllowed);
       }
     }
     default:
@@ -2637,8 +2624,7 @@ std::unique_ptr<Filter> NegatedBytesValues::mergeWith(
           bytesRangeOther->upperUnbounded(),
           hiExclusive,
           false));
-      return std::make_unique<MultiRange>(
-          std::move(ranges), bothNullAllowed, false);
+      return std::make_unique<MultiRange>(std::move(ranges), bothNullAllowed);
     }
     default:
       VELOX_UNREACHABLE();
diff --git a/velox/type/Filter.h b/velox/type/Filter.h
index c60fb98d9df4..4a84992f71d0 100644
--- a/velox/type/Filter.h
+++ b/velox/type/Filter.h
@@ -2042,16 +2042,14 @@ class MultiRange final : public Filter {
   /// All entries must support the same data types.
   /// @param nullAllowed Null values are passing the filter if true. nullAllowed
   /// flags in the 'ranges' filters are ignored.
-  /// @param nanAllowed Not-a-Number floating point values are passing the
-  /// filter if true. Applies to floating point data types only. NaN values are
-  /// not further tested using contained filters.
+  /// TODO: remove redundant param `nanAllowed` after presto removes the use of
+  /// this param. For now, we set a default value to avoid breaking presto.
   MultiRange(
       std::vector<std::unique_ptr<Filter>> filters,
       bool nullAllowed,
-      bool nanAllowed)
+      bool nanAllowed = false)
       : Filter(true, nullAllowed, FilterKind::kMultiRange),
-        filters_(std::move(filters)),
-        nanAllowed_(nanAllowed) {}
+        filters_(std::move(filters)) {}
 
   folly::dynamic serialize() const override;
 
@@ -2083,15 +2081,10 @@ class MultiRange final : public Filter {
 
   std::unique_ptr<Filter> mergeWith(const Filter* other) const override final;
 
-  bool nanAllowed() const {
-    return nanAllowed_;
-  }
-
   bool testingEquals(const Filter& other) const final;
 
  private:
   const std::vector<std::unique_ptr<Filter>> filters_;
-  const bool nanAllowed_;
 };
 
 // Helper for applying filters to different types
diff --git a/velox/type/tests/FilterSerDeTest.cpp b/velox/type/tests/FilterSerDeTest.cpp
index 03bd2dd363a0..0b4c52c1a902 100644
--- a/velox/type/tests/FilterSerDeTest.cpp
+++ b/velox/type/tests/FilterSerDeTest.cpp
@@ -152,7 +152,7 @@ TEST_F(FilterSerDeTest, multiFilter) {
   filters.emplace_back(std::make_unique<BytesRange>(
       "ABCD", true, true, "FFFF", false, true, false));
 
-  MultiRange multiRange(std::move(filters), true, true);
+  MultiRange multiRange(std::move(filters), true);
   testSerde(multiRange);
 }
 
diff --git a/velox/type/tests/FilterTest.cpp b/velox/type/tests/FilterTest.cpp
index b0e7ed5f0162..339b678adba3 100644
--- a/velox/type/tests/FilterTest.cpp
+++ b/velox/type/tests/FilterTest.cpp
@@ -1050,7 +1050,7 @@ TEST(FilterTest, multiRange) {
   EXPECT_TRUE(filter->testDouble(1.3));
 
   EXPECT_FALSE(filter->testNull());
-  EXPECT_FALSE(filter->testDouble(std::nan("nan")));
+  EXPECT_TRUE(filter->testDouble(std::nan("nan")));
   EXPECT_FALSE(filter->testDouble(1.2));
 
   filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2));
@@ -1059,7 +1059,7 @@ TEST(FilterTest, multiRange) {
   EXPECT_TRUE(filter->testFloat(1.1f));
   EXPECT_FALSE(filter->testFloat(1.2f));
   EXPECT_TRUE(filter->testFloat(1.3f));
-  EXPECT_FALSE(filter->testFloat(std::nanf("nan")));
+  EXPECT_TRUE(filter->testFloat(std::nanf("nan")));
 
   // != ''
   filter = orFilter(lessThan(""), greaterThan(""));
@@ -1069,51 +1069,39 @@ TEST(FilterTest, multiRange) {
 
 TEST(FilterTest, multiRangeWithNaNs) {
   // x <> 1.2 with nanAllowed true
-  auto filter =
-      orFilter(lessThanFloat(1.2), greaterThanFloat(1.2), false, true);
+  auto filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2), false);
   EXPECT_TRUE(filter->testFloat(std::nanf("nan")));
   EXPECT_FALSE(filter->testFloat(1.2f));
   EXPECT_TRUE(filter->testFloat(1.1f));
 
-  filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2), false, true);
+  filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2), false);
   EXPECT_TRUE(filter->testDouble(std::nan("nan")));
   EXPECT_FALSE(filter->testDouble(1.2));
   EXPECT_TRUE(filter->testDouble(1.1));
 
   // x <> 1.2 with nanAllowed false
   filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2));
-  EXPECT_FALSE(filter->testFloat(std::nanf("nan")));
+  EXPECT_TRUE(filter->testFloat(std::nanf("nan")));
   EXPECT_TRUE(filter->testFloat(1.0f));
 
   filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2));
-  EXPECT_FALSE(filter->testDouble(std::nan("nan")));
+  EXPECT_TRUE(filter->testDouble(std::nan("nan")));
   EXPECT_TRUE(filter->testDouble(1.4));
 
   // x NOT IN (1.2, 1.3) with nanAllowed true
-  filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.3), false, true);
+  filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.3), false);
   EXPECT_TRUE(filter->testFloat(std::nanf("nan")));
   EXPECT_FALSE(filter->testFloat(1.2f));
   EXPECT_FALSE(filter->testFloat(1.3f));
   EXPECT_TRUE(filter->testFloat(1.4f));
   EXPECT_TRUE(filter->testFloat(1.1f));
 
-  filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.3), false, true);
+  filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.3), false);
   EXPECT_TRUE(filter->testDouble(std::nan("nan")));
   EXPECT_FALSE(filter->testDouble(1.2));
   EXPECT_FALSE(filter->testDouble(1.3));
   EXPECT_TRUE(filter->testDouble(1.4));
   EXPECT_TRUE(filter->testDouble(1.1));
-
-  // x NOT IN (1.2) with nanAllowed false
-  filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2));
-  EXPECT_FALSE(filter->testFloat(std::nanf("nan")));
-  EXPECT_FALSE(filter->testFloat(1.2f));
-  EXPECT_TRUE(filter->testFloat(1.3f));
-
-  filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2));
-  EXPECT_FALSE(filter->testDouble(std::nan("nan")));
-  EXPECT_FALSE(filter->testDouble(1.2));
-  EXPECT_TRUE(filter->testDouble(1.3));
 }
 
 TEST(FilterTest, createBigintValues) {
diff --git a/velox/type/tests/NegatedBytesRangeBenchmark.cpp b/velox/type/tests/NegatedBytesRangeBenchmark.cpp
index 3ed0b3cef249..8459e6db74e6 100644
--- a/velox/type/tests/NegatedBytesRangeBenchmark.cpp
+++ b/velox/type/tests/NegatedBytesRangeBenchmark.cpp
@@ -169,8 +169,8 @@ int32_t main(int32_t argc, char** argv) {
           "", true, false, lo, false, true, false));
       rangeFilters.emplace_back(std::make_unique<common::BytesRange>(
           hi, false, false, "", true, false, false));
-      multiRanges.emplace_back(std::make_unique<common::MultiRange>(
-          std::move(rangeFilters), false, false));
+      multiRanges.emplace_back(
+          std::make_unique<common::MultiRange>(std::move(rangeFilters), false));
 
       LOG(INFO) << "Generated filter for length " << len << " with percentage "
                 << pct;
diff --git a/velox/type/tests/NegatedBytesValuesBenchmark.cpp b/velox/type/tests/NegatedBytesValuesBenchmark.cpp
index cac1d449f904..78da7f714dca 100644
--- a/velox/type/tests/NegatedBytesValuesBenchmark.cpp
+++ b/velox/type/tests/NegatedBytesValuesBenchmark.cpp
@@ -178,7 +178,7 @@ int32_t main(int32_t argc, char* argv[]) {
       range_filters.emplace_back(std::make_unique<common::BytesRange>(
           *back, false, true, "", true, true, false));
       multi_ranges.emplace_back(std::make_unique<common::MultiRange>(
-          std::move(range_filters), false, false));
+          std::move(range_filters), false));
 
       LOG(INFO) << "Generated filter for length " << len << " with size "
                 << size;