From 2391e9c85d819c2f0daf801d4718116cb87deda6 Mon Sep 17 00:00:00 2001 From: Bikramjeet Vig Date: Wed, 24 Jul 2024 11:36:03 -0700 Subject: [PATCH] Fix MultiRange filter to correctly handle NaN input (#10535) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10535 The current MultiRange filter includes an extra flag specifically for allowing or filtering out NaN values, independent of its internal filters. This responsibility should instead be managed by the filters themselves. This change therefore removes that additional handling to ensure consistency between using a single filter VS using multiple. Reviewed By: Yuhta Differential Revision: D60138398 --- velox/connectors/hive/HiveConnectorUtil.cpp | 2 +- velox/expression/ExprToSubfieldFilter.cpp | 3 +- velox/expression/ExprToSubfieldFilter.h | 10 ++----- velox/type/Filter.cpp | 28 +++++-------------- velox/type/Filter.h | 15 +++------- velox/type/tests/FilterSerDeTest.cpp | 2 +- velox/type/tests/FilterTest.cpp | 28 ++++++------------- .../type/tests/NegatedBytesRangeBenchmark.cpp | 4 +-- .../tests/NegatedBytesValuesBenchmark.cpp | 2 +- 9 files changed, 28 insertions(+), 66 deletions(-) diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index 7a3004db1a383..1322c85e37710 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -91,7 +91,7 @@ std::unique_ptr makeFloatingPointMapKeyFilter( if (filters.size() == 1) { return std::move(filters[0]); } - return std::make_unique(std::move(filters), false, false); + return std::make_unique(std::move(filters), false); } // Recursively add subfields to scan spec. diff --git a/velox/expression/ExprToSubfieldFilter.cpp b/velox/expression/ExprToSubfieldFilter.cpp index 8760dda6e693f..f876c5de3fa84 100644 --- a/velox/expression/ExprToSubfieldFilter.cpp +++ b/velox/expression/ExprToSubfieldFilter.cpp @@ -336,8 +336,7 @@ std::unique_ptr makeNotEqualFilter( std::vector> filters; filters.emplace_back(std::move(lessThanFilter)); filters.emplace_back(std::move(greaterThanFilter)); - return std::make_unique( - std::move(filters), false, false); + return std::make_unique(std::move(filters), false); } } diff --git a/velox/expression/ExprToSubfieldFilter.h b/velox/expression/ExprToSubfieldFilter.h index 193e3413ea25a..f87d28e2530bc 100644 --- a/velox/expression/ExprToSubfieldFilter.h +++ b/velox/expression/ExprToSubfieldFilter.h @@ -320,16 +320,12 @@ inline std::unique_ptr isNotNull() { } template -std::unique_ptr orFilter( - std::unique_ptr a, - std::unique_ptr b, - bool nullAllowed = false, - bool nanAllowed = false) { +std::unique_ptr +orFilter(std::unique_ptr a, std::unique_ptr b, bool nullAllowed = false) { std::vector> filters; filters.emplace_back(std::move(a)); filters.emplace_back(std::move(b)); - return std::make_unique( - std::move(filters), nullAllowed, nanAllowed); + return std::make_unique(std::move(filters), nullAllowed); } inline std::unique_ptr lessThanHugeint( diff --git a/velox/type/Filter.cpp b/velox/type/Filter.cpp index 07d64b3b07630..d0f6932b4aad9 100644 --- a/velox/type/Filter.cpp +++ b/velox/type/Filter.cpp @@ -712,7 +712,6 @@ bool NegatedBytesValues::testingEquals(const Filter& other) const { folly::dynamic MultiRange::serialize() const { auto obj = Filter::serializeBase("MultiRange"); - obj["nanAllowed"] = nanAllowed_; folly::dynamic arr = folly::dynamic::array; for (const auto& f : filters_) { arr.push_back(f->serialize()); @@ -723,7 +722,6 @@ folly::dynamic MultiRange::serialize() const { FilterPtr MultiRange::create(const folly::dynamic& obj) { auto nullAllowed = deserializeNullAllowed(obj); - auto nanAllowed = obj["nanAllowed"].asBool(); folly::dynamic arr = obj["filters"]; auto tmpFilters = ISerializable::deserialize>(arr); @@ -733,14 +731,13 @@ FilterPtr MultiRange::create(const folly::dynamic& obj) { filters.emplace_back(f->clone()); } - return std::make_unique( - std::move(filters), nullAllowed, nanAllowed); + return std::make_unique(std::move(filters), nullAllowed); } bool MultiRange::testingEquals(const Filter& other) const { auto otherMultiRange = dynamic_cast(&other); auto res = otherMultiRange != nullptr && Filter::testingBaseEquals(other) && - nanAllowed_ == otherMultiRange->nanAllowed_ && + filters_.size() == otherMultiRange->filters_.size(); if (!res) { @@ -1357,7 +1354,7 @@ std::unique_ptr NegatedBytesRange::toMultiRange() const { if (accepted.size() == 1) { return accepted[0]->clone(nullAllowed_); } - return std::make_unique(std::move(accepted), nullAllowed_, false); + return std::make_unique(std::move(accepted), nullAllowed_); } bool NegatedBytesValues::testBytesRange( @@ -1439,17 +1436,13 @@ std::unique_ptr MultiRange::clone( if (nullAllowed) { return std::make_unique( - std::move(filters), nullAllowed.value(), nanAllowed_); + std::move(filters), nullAllowed.value()); } else { - return std::make_unique( - std::move(filters), nullAllowed_, nanAllowed_); + return std::make_unique(std::move(filters), nullAllowed_); } } bool MultiRange::testDouble(double value) const { - if (std::isnan(value)) { - return nanAllowed_; - } for (const auto& filter : filters_) { if (filter->testDouble(value)) { return true; @@ -1459,9 +1452,6 @@ bool MultiRange::testDouble(double value) const { } bool MultiRange::testFloat(float value) const { - if (std::isnan(value)) { - return nanAllowed_; - } for (const auto& filter : filters_) { if (filter->testFloat(value)) { return true; @@ -1554,7 +1544,6 @@ std::unique_ptr MultiRange::mergeWith(const Filter* other) const { case FilterKind::kBytesRange: case FilterKind::kMultiRange: { bool bothNullAllowed = nullAllowed_ && other->testNull(); - bool bothNanAllowed = nanAllowed_; std::vector otherFilters; if (other->kind() == FilterKind::kMultiRange) { @@ -1562,7 +1551,6 @@ std::unique_ptr MultiRange::mergeWith(const Filter* other) const { for (auto const& filterOther : multiRangeOther->filters()) { otherFilters.emplace_back(filterOther.get()); } - bothNanAllowed = bothNanAllowed && multiRangeOther->nanAllowed(); } else { otherFilters.emplace_back(other); } @@ -1614,8 +1602,7 @@ std::unique_ptr MultiRange::mergeWith(const Filter* other) const { } else if (merged.size() == 1) { return merged.front()->clone(bothNullAllowed); } else { - return std::make_unique( - std::move(merged), bothNullAllowed, bothNanAllowed); + return std::make_unique(std::move(merged), bothNullAllowed); } } default: @@ -2637,8 +2624,7 @@ std::unique_ptr NegatedBytesValues::mergeWith( bytesRangeOther->upperUnbounded(), hiExclusive, false)); - return std::make_unique( - std::move(ranges), bothNullAllowed, false); + return std::make_unique(std::move(ranges), bothNullAllowed); } default: VELOX_UNREACHABLE(); diff --git a/velox/type/Filter.h b/velox/type/Filter.h index c60fb98d9df4e..4a84992f71d04 100644 --- a/velox/type/Filter.h +++ b/velox/type/Filter.h @@ -2042,16 +2042,14 @@ class MultiRange final : public Filter { /// All entries must support the same data types. /// @param nullAllowed Null values are passing the filter if true. nullAllowed /// flags in the 'ranges' filters are ignored. - /// @param nanAllowed Not-a-Number floating point values are passing the - /// filter if true. Applies to floating point data types only. NaN values are - /// not further tested using contained filters. + /// TODO: remove redundant param `nanAllowed` after presto removes the use of + /// this param. For now, we set a default value to avoid breaking presto. MultiRange( std::vector> filters, bool nullAllowed, - bool nanAllowed) + bool nanAllowed = false) : Filter(true, nullAllowed, FilterKind::kMultiRange), - filters_(std::move(filters)), - nanAllowed_(nanAllowed) {} + filters_(std::move(filters)) {} folly::dynamic serialize() const override; @@ -2083,15 +2081,10 @@ class MultiRange final : public Filter { std::unique_ptr mergeWith(const Filter* other) const override final; - bool nanAllowed() const { - return nanAllowed_; - } - bool testingEquals(const Filter& other) const final; private: const std::vector> filters_; - const bool nanAllowed_; }; // Helper for applying filters to different types diff --git a/velox/type/tests/FilterSerDeTest.cpp b/velox/type/tests/FilterSerDeTest.cpp index 03bd2dd363a00..0b4c52c1a9021 100644 --- a/velox/type/tests/FilterSerDeTest.cpp +++ b/velox/type/tests/FilterSerDeTest.cpp @@ -152,7 +152,7 @@ TEST_F(FilterSerDeTest, multiFilter) { filters.emplace_back(std::make_unique( "ABCD", true, true, "FFFF", false, true, false)); - MultiRange multiRange(std::move(filters), true, true); + MultiRange multiRange(std::move(filters), true); testSerde(multiRange); } diff --git a/velox/type/tests/FilterTest.cpp b/velox/type/tests/FilterTest.cpp index b0e7ed5f01625..339b678adba3d 100644 --- a/velox/type/tests/FilterTest.cpp +++ b/velox/type/tests/FilterTest.cpp @@ -1050,7 +1050,7 @@ TEST(FilterTest, multiRange) { EXPECT_TRUE(filter->testDouble(1.3)); EXPECT_FALSE(filter->testNull()); - EXPECT_FALSE(filter->testDouble(std::nan("nan"))); + EXPECT_TRUE(filter->testDouble(std::nan("nan"))); EXPECT_FALSE(filter->testDouble(1.2)); filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2)); @@ -1059,7 +1059,7 @@ TEST(FilterTest, multiRange) { EXPECT_TRUE(filter->testFloat(1.1f)); EXPECT_FALSE(filter->testFloat(1.2f)); EXPECT_TRUE(filter->testFloat(1.3f)); - EXPECT_FALSE(filter->testFloat(std::nanf("nan"))); + EXPECT_TRUE(filter->testFloat(std::nanf("nan"))); // != '' filter = orFilter(lessThan(""), greaterThan("")); @@ -1069,51 +1069,39 @@ TEST(FilterTest, multiRange) { TEST(FilterTest, multiRangeWithNaNs) { // x <> 1.2 with nanAllowed true - auto filter = - orFilter(lessThanFloat(1.2), greaterThanFloat(1.2), false, true); + auto filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2), false); EXPECT_TRUE(filter->testFloat(std::nanf("nan"))); EXPECT_FALSE(filter->testFloat(1.2f)); EXPECT_TRUE(filter->testFloat(1.1f)); - filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2), false, true); + filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2), false); EXPECT_TRUE(filter->testDouble(std::nan("nan"))); EXPECT_FALSE(filter->testDouble(1.2)); EXPECT_TRUE(filter->testDouble(1.1)); // x <> 1.2 with nanAllowed false filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2)); - EXPECT_FALSE(filter->testFloat(std::nanf("nan"))); + EXPECT_TRUE(filter->testFloat(std::nanf("nan"))); EXPECT_TRUE(filter->testFloat(1.0f)); filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2)); - EXPECT_FALSE(filter->testDouble(std::nan("nan"))); + EXPECT_TRUE(filter->testDouble(std::nan("nan"))); EXPECT_TRUE(filter->testDouble(1.4)); // x NOT IN (1.2, 1.3) with nanAllowed true - filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.3), false, true); + filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.3), false); EXPECT_TRUE(filter->testFloat(std::nanf("nan"))); EXPECT_FALSE(filter->testFloat(1.2f)); EXPECT_FALSE(filter->testFloat(1.3f)); EXPECT_TRUE(filter->testFloat(1.4f)); EXPECT_TRUE(filter->testFloat(1.1f)); - filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.3), false, true); + filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.3), false); EXPECT_TRUE(filter->testDouble(std::nan("nan"))); EXPECT_FALSE(filter->testDouble(1.2)); EXPECT_FALSE(filter->testDouble(1.3)); EXPECT_TRUE(filter->testDouble(1.4)); EXPECT_TRUE(filter->testDouble(1.1)); - - // x NOT IN (1.2) with nanAllowed false - filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2)); - EXPECT_FALSE(filter->testFloat(std::nanf("nan"))); - EXPECT_FALSE(filter->testFloat(1.2f)); - EXPECT_TRUE(filter->testFloat(1.3f)); - - filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2)); - EXPECT_FALSE(filter->testDouble(std::nan("nan"))); - EXPECT_FALSE(filter->testDouble(1.2)); - EXPECT_TRUE(filter->testDouble(1.3)); } TEST(FilterTest, createBigintValues) { diff --git a/velox/type/tests/NegatedBytesRangeBenchmark.cpp b/velox/type/tests/NegatedBytesRangeBenchmark.cpp index 3ed0b3cef249f..8459e6db74e6c 100644 --- a/velox/type/tests/NegatedBytesRangeBenchmark.cpp +++ b/velox/type/tests/NegatedBytesRangeBenchmark.cpp @@ -169,8 +169,8 @@ int32_t main(int32_t argc, char** argv) { "", true, false, lo, false, true, false)); rangeFilters.emplace_back(std::make_unique( hi, false, false, "", true, false, false)); - multiRanges.emplace_back(std::make_unique( - std::move(rangeFilters), false, false)); + multiRanges.emplace_back( + std::make_unique(std::move(rangeFilters), false)); LOG(INFO) << "Generated filter for length " << len << " with percentage " << pct; diff --git a/velox/type/tests/NegatedBytesValuesBenchmark.cpp b/velox/type/tests/NegatedBytesValuesBenchmark.cpp index cac1d449f904c..78da7f714dca5 100644 --- a/velox/type/tests/NegatedBytesValuesBenchmark.cpp +++ b/velox/type/tests/NegatedBytesValuesBenchmark.cpp @@ -178,7 +178,7 @@ int32_t main(int32_t argc, char* argv[]) { range_filters.emplace_back(std::make_unique( *back, false, true, "", true, true, false)); multi_ranges.emplace_back(std::make_unique( - std::move(range_filters), false, false)); + std::move(range_filters), false)); LOG(INFO) << "Generated filter for length " << len << " with size " << size;