From f676360b0c39d42bed4c97ca186f3ccb6d41b4fd Mon Sep 17 00:00:00 2001 From: Bikramjeet Vig Date: Wed, 24 Jul 2024 11:48:14 -0700 Subject: [PATCH] Fix MultiRange filter to correctly handle NaN input (#10535) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10535 The current MultiRange filter includes an extra flag specifically for allowing or filtering out NaN values, independent of its internal filters. This responsibility should instead be managed by the filters themselves. This change therefore removes that additional handling to ensure consistency between using a single filter VS using multiple. Reviewed By: Yuhta Differential Revision: D60138398 --- velox/connectors/hive/HiveConnectorUtil.cpp | 2 +- velox/expression/ExprToSubfieldFilter.cpp | 3 +- velox/expression/ExprToSubfieldFilter.h | 10 ++----- velox/type/Filter.cpp | 28 +++++-------------- velox/type/Filter.h | 15 +++------- velox/type/tests/FilterSerDeTest.cpp | 2 +- velox/type/tests/FilterTest.cpp | 28 ++++++------------- .../type/tests/NegatedBytesRangeBenchmark.cpp | 4 +-- .../tests/NegatedBytesValuesBenchmark.cpp | 2 +- 9 files changed, 28 insertions(+), 66 deletions(-) diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index 7a3004db1a38..1322c85e3771 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -91,7 +91,7 @@ std::unique_ptr makeFloatingPointMapKeyFilter( if (filters.size() == 1) { return std::move(filters[0]); } - return std::make_unique(std::move(filters), false, false); + return std::make_unique(std::move(filters), false); } // Recursively add subfields to scan spec. diff --git a/velox/expression/ExprToSubfieldFilter.cpp b/velox/expression/ExprToSubfieldFilter.cpp index 8760dda6e693..f876c5de3fa8 100644 --- a/velox/expression/ExprToSubfieldFilter.cpp +++ b/velox/expression/ExprToSubfieldFilter.cpp @@ -336,8 +336,7 @@ std::unique_ptr makeNotEqualFilter( std::vector> filters; filters.emplace_back(std::move(lessThanFilter)); filters.emplace_back(std::move(greaterThanFilter)); - return std::make_unique( - std::move(filters), false, false); + return std::make_unique(std::move(filters), false); } } diff --git a/velox/expression/ExprToSubfieldFilter.h b/velox/expression/ExprToSubfieldFilter.h index 193e3413ea25..f87d28e2530b 100644 --- a/velox/expression/ExprToSubfieldFilter.h +++ b/velox/expression/ExprToSubfieldFilter.h @@ -320,16 +320,12 @@ inline std::unique_ptr isNotNull() { } template -std::unique_ptr orFilter( - std::unique_ptr a, - std::unique_ptr b, - bool nullAllowed = false, - bool nanAllowed = false) { +std::unique_ptr +orFilter(std::unique_ptr a, std::unique_ptr b, bool nullAllowed = false) { std::vector> filters; filters.emplace_back(std::move(a)); filters.emplace_back(std::move(b)); - return std::make_unique( - std::move(filters), nullAllowed, nanAllowed); + return std::make_unique(std::move(filters), nullAllowed); } inline std::unique_ptr lessThanHugeint( diff --git a/velox/type/Filter.cpp b/velox/type/Filter.cpp index 07d64b3b0763..d0f6932b4aad 100644 --- a/velox/type/Filter.cpp +++ b/velox/type/Filter.cpp @@ -712,7 +712,6 @@ bool NegatedBytesValues::testingEquals(const Filter& other) const { folly::dynamic MultiRange::serialize() const { auto obj = Filter::serializeBase("MultiRange"); - obj["nanAllowed"] = nanAllowed_; folly::dynamic arr = folly::dynamic::array; for (const auto& f : filters_) { arr.push_back(f->serialize()); @@ -723,7 +722,6 @@ folly::dynamic MultiRange::serialize() const { FilterPtr MultiRange::create(const folly::dynamic& obj) { auto nullAllowed = deserializeNullAllowed(obj); - auto nanAllowed = obj["nanAllowed"].asBool(); folly::dynamic arr = obj["filters"]; auto tmpFilters = ISerializable::deserialize>(arr); @@ -733,14 +731,13 @@ FilterPtr MultiRange::create(const folly::dynamic& obj) { filters.emplace_back(f->clone()); } - return std::make_unique( - std::move(filters), nullAllowed, nanAllowed); + return std::make_unique(std::move(filters), nullAllowed); } bool MultiRange::testingEquals(const Filter& other) const { auto otherMultiRange = dynamic_cast(&other); auto res = otherMultiRange != nullptr && Filter::testingBaseEquals(other) && - nanAllowed_ == otherMultiRange->nanAllowed_ && + filters_.size() == otherMultiRange->filters_.size(); if (!res) { @@ -1357,7 +1354,7 @@ std::unique_ptr NegatedBytesRange::toMultiRange() const { if (accepted.size() == 1) { return accepted[0]->clone(nullAllowed_); } - return std::make_unique(std::move(accepted), nullAllowed_, false); + return std::make_unique(std::move(accepted), nullAllowed_); } bool NegatedBytesValues::testBytesRange( @@ -1439,17 +1436,13 @@ std::unique_ptr MultiRange::clone( if (nullAllowed) { return std::make_unique( - std::move(filters), nullAllowed.value(), nanAllowed_); + std::move(filters), nullAllowed.value()); } else { - return std::make_unique( - std::move(filters), nullAllowed_, nanAllowed_); + return std::make_unique(std::move(filters), nullAllowed_); } } bool MultiRange::testDouble(double value) const { - if (std::isnan(value)) { - return nanAllowed_; - } for (const auto& filter : filters_) { if (filter->testDouble(value)) { return true; @@ -1459,9 +1452,6 @@ bool MultiRange::testDouble(double value) const { } bool MultiRange::testFloat(float value) const { - if (std::isnan(value)) { - return nanAllowed_; - } for (const auto& filter : filters_) { if (filter->testFloat(value)) { return true; @@ -1554,7 +1544,6 @@ std::unique_ptr MultiRange::mergeWith(const Filter* other) const { case FilterKind::kBytesRange: case FilterKind::kMultiRange: { bool bothNullAllowed = nullAllowed_ && other->testNull(); - bool bothNanAllowed = nanAllowed_; std::vector otherFilters; if (other->kind() == FilterKind::kMultiRange) { @@ -1562,7 +1551,6 @@ std::unique_ptr MultiRange::mergeWith(const Filter* other) const { for (auto const& filterOther : multiRangeOther->filters()) { otherFilters.emplace_back(filterOther.get()); } - bothNanAllowed = bothNanAllowed && multiRangeOther->nanAllowed(); } else { otherFilters.emplace_back(other); } @@ -1614,8 +1602,7 @@ std::unique_ptr MultiRange::mergeWith(const Filter* other) const { } else if (merged.size() == 1) { return merged.front()->clone(bothNullAllowed); } else { - return std::make_unique( - std::move(merged), bothNullAllowed, bothNanAllowed); + return std::make_unique(std::move(merged), bothNullAllowed); } } default: @@ -2637,8 +2624,7 @@ std::unique_ptr NegatedBytesValues::mergeWith( bytesRangeOther->upperUnbounded(), hiExclusive, false)); - return std::make_unique( - std::move(ranges), bothNullAllowed, false); + return std::make_unique(std::move(ranges), bothNullAllowed); } default: VELOX_UNREACHABLE(); diff --git a/velox/type/Filter.h b/velox/type/Filter.h index c60fb98d9df4..4a84992f71d0 100644 --- a/velox/type/Filter.h +++ b/velox/type/Filter.h @@ -2042,16 +2042,14 @@ class MultiRange final : public Filter { /// All entries must support the same data types. /// @param nullAllowed Null values are passing the filter if true. nullAllowed /// flags in the 'ranges' filters are ignored. - /// @param nanAllowed Not-a-Number floating point values are passing the - /// filter if true. Applies to floating point data types only. NaN values are - /// not further tested using contained filters. + /// TODO: remove redundant param `nanAllowed` after presto removes the use of + /// this param. For now, we set a default value to avoid breaking presto. MultiRange( std::vector> filters, bool nullAllowed, - bool nanAllowed) + bool nanAllowed = false) : Filter(true, nullAllowed, FilterKind::kMultiRange), - filters_(std::move(filters)), - nanAllowed_(nanAllowed) {} + filters_(std::move(filters)) {} folly::dynamic serialize() const override; @@ -2083,15 +2081,10 @@ class MultiRange final : public Filter { std::unique_ptr mergeWith(const Filter* other) const override final; - bool nanAllowed() const { - return nanAllowed_; - } - bool testingEquals(const Filter& other) const final; private: const std::vector> filters_; - const bool nanAllowed_; }; // Helper for applying filters to different types diff --git a/velox/type/tests/FilterSerDeTest.cpp b/velox/type/tests/FilterSerDeTest.cpp index 03bd2dd363a0..0b4c52c1a902 100644 --- a/velox/type/tests/FilterSerDeTest.cpp +++ b/velox/type/tests/FilterSerDeTest.cpp @@ -152,7 +152,7 @@ TEST_F(FilterSerDeTest, multiFilter) { filters.emplace_back(std::make_unique( "ABCD", true, true, "FFFF", false, true, false)); - MultiRange multiRange(std::move(filters), true, true); + MultiRange multiRange(std::move(filters), true); testSerde(multiRange); } diff --git a/velox/type/tests/FilterTest.cpp b/velox/type/tests/FilterTest.cpp index b0e7ed5f0162..339b678adba3 100644 --- a/velox/type/tests/FilterTest.cpp +++ b/velox/type/tests/FilterTest.cpp @@ -1050,7 +1050,7 @@ TEST(FilterTest, multiRange) { EXPECT_TRUE(filter->testDouble(1.3)); EXPECT_FALSE(filter->testNull()); - EXPECT_FALSE(filter->testDouble(std::nan("nan"))); + EXPECT_TRUE(filter->testDouble(std::nan("nan"))); EXPECT_FALSE(filter->testDouble(1.2)); filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2)); @@ -1059,7 +1059,7 @@ TEST(FilterTest, multiRange) { EXPECT_TRUE(filter->testFloat(1.1f)); EXPECT_FALSE(filter->testFloat(1.2f)); EXPECT_TRUE(filter->testFloat(1.3f)); - EXPECT_FALSE(filter->testFloat(std::nanf("nan"))); + EXPECT_TRUE(filter->testFloat(std::nanf("nan"))); // != '' filter = orFilter(lessThan(""), greaterThan("")); @@ -1069,51 +1069,39 @@ TEST(FilterTest, multiRange) { TEST(FilterTest, multiRangeWithNaNs) { // x <> 1.2 with nanAllowed true - auto filter = - orFilter(lessThanFloat(1.2), greaterThanFloat(1.2), false, true); + auto filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2), false); EXPECT_TRUE(filter->testFloat(std::nanf("nan"))); EXPECT_FALSE(filter->testFloat(1.2f)); EXPECT_TRUE(filter->testFloat(1.1f)); - filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2), false, true); + filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2), false); EXPECT_TRUE(filter->testDouble(std::nan("nan"))); EXPECT_FALSE(filter->testDouble(1.2)); EXPECT_TRUE(filter->testDouble(1.1)); // x <> 1.2 with nanAllowed false filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2)); - EXPECT_FALSE(filter->testFloat(std::nanf("nan"))); + EXPECT_TRUE(filter->testFloat(std::nanf("nan"))); EXPECT_TRUE(filter->testFloat(1.0f)); filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2)); - EXPECT_FALSE(filter->testDouble(std::nan("nan"))); + EXPECT_TRUE(filter->testDouble(std::nan("nan"))); EXPECT_TRUE(filter->testDouble(1.4)); // x NOT IN (1.2, 1.3) with nanAllowed true - filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.3), false, true); + filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.3), false); EXPECT_TRUE(filter->testFloat(std::nanf("nan"))); EXPECT_FALSE(filter->testFloat(1.2f)); EXPECT_FALSE(filter->testFloat(1.3f)); EXPECT_TRUE(filter->testFloat(1.4f)); EXPECT_TRUE(filter->testFloat(1.1f)); - filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.3), false, true); + filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.3), false); EXPECT_TRUE(filter->testDouble(std::nan("nan"))); EXPECT_FALSE(filter->testDouble(1.2)); EXPECT_FALSE(filter->testDouble(1.3)); EXPECT_TRUE(filter->testDouble(1.4)); EXPECT_TRUE(filter->testDouble(1.1)); - - // x NOT IN (1.2) with nanAllowed false - filter = orFilter(lessThanFloat(1.2), greaterThanFloat(1.2)); - EXPECT_FALSE(filter->testFloat(std::nanf("nan"))); - EXPECT_FALSE(filter->testFloat(1.2f)); - EXPECT_TRUE(filter->testFloat(1.3f)); - - filter = orFilter(lessThanDouble(1.2), greaterThanDouble(1.2)); - EXPECT_FALSE(filter->testDouble(std::nan("nan"))); - EXPECT_FALSE(filter->testDouble(1.2)); - EXPECT_TRUE(filter->testDouble(1.3)); } TEST(FilterTest, createBigintValues) { diff --git a/velox/type/tests/NegatedBytesRangeBenchmark.cpp b/velox/type/tests/NegatedBytesRangeBenchmark.cpp index 3ed0b3cef249..8459e6db74e6 100644 --- a/velox/type/tests/NegatedBytesRangeBenchmark.cpp +++ b/velox/type/tests/NegatedBytesRangeBenchmark.cpp @@ -169,8 +169,8 @@ int32_t main(int32_t argc, char** argv) { "", true, false, lo, false, true, false)); rangeFilters.emplace_back(std::make_unique( hi, false, false, "", true, false, false)); - multiRanges.emplace_back(std::make_unique( - std::move(rangeFilters), false, false)); + multiRanges.emplace_back( + std::make_unique(std::move(rangeFilters), false)); LOG(INFO) << "Generated filter for length " << len << " with percentage " << pct; diff --git a/velox/type/tests/NegatedBytesValuesBenchmark.cpp b/velox/type/tests/NegatedBytesValuesBenchmark.cpp index cac1d449f904..78da7f714dca 100644 --- a/velox/type/tests/NegatedBytesValuesBenchmark.cpp +++ b/velox/type/tests/NegatedBytesValuesBenchmark.cpp @@ -178,7 +178,7 @@ int32_t main(int32_t argc, char* argv[]) { range_filters.emplace_back(std::make_unique( *back, false, true, "", true, true, false)); multi_ranges.emplace_back(std::make_unique( - std::move(range_filters), false, false)); + std::move(range_filters), false)); LOG(INFO) << "Generated filter for length " << len << " with size " << size;