From d34e49b5b917eee0a9c5e7cdf164dfae0452de28 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Thu, 3 Jun 2021 11:32:26 +0300 Subject: [PATCH 1/2] Added support of hasAny function to bloom_filter index. Also fixed bug discovered by AST fuzzer. (cherry-pciked 3ef5a0c3fb from Enmk/ClickHouse) --- .../MergeTreeIndexConditionBloomFilter.cpp | 48 +++++++++++++++---- .../MergeTreeIndexConditionBloomFilter.h | 1 + .../01888_bloom_filter_hasAny.reference | 0 .../0_stateless/01888_bloom_filter_hasAny.sql | 30 ++++++++++++ 4 files changed, 70 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/01888_bloom_filter_hasAny.reference create mode 100644 tests/queries/0_stateless/01888_bloom_filter_hasAny.sql diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp index 97c91527d27e..60345aef3449 100644 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp @@ -105,11 +105,12 @@ bool MergeTreeIndexConditionBloomFilter::alwaysUnknownOrTrue() const rpn_stack.push_back(true); } else if (element.function == RPNElement::FUNCTION_EQUALS - || element.function == RPNElement::FUNCTION_NOT_EQUALS - || element.function == RPNElement::FUNCTION_HAS - || element.function == RPNElement::FUNCTION_IN - || element.function == RPNElement::FUNCTION_NOT_IN - || element.function == RPNElement::ALWAYS_FALSE) + || element.function == RPNElement::FUNCTION_NOT_EQUALS + || element.function == RPNElement::FUNCTION_HAS + || element.function == RPNElement::FUNCTION_HAS_ANY + || element.function == RPNElement::FUNCTION_IN + || element.function == RPNElement::FUNCTION_NOT_IN + || element.function == RPNElement::ALWAYS_FALSE) { rpn_stack.push_back(false); } @@ -153,7 +154,8 @@ bool MergeTreeIndexConditionBloomFilter::mayBeTrueOnGranule(const MergeTreeIndex || element.function == RPNElement::FUNCTION_NOT_IN || element.function == RPNElement::FUNCTION_EQUALS || element.function == RPNElement::FUNCTION_NOT_EQUALS - || element.function == RPNElement::FUNCTION_HAS) + || element.function == RPNElement::FUNCTION_HAS + || element.function == RPNElement::FUNCTION_HAS_ANY) { bool match_rows = true; const auto & predicate = element.predicate; @@ -216,7 +218,7 @@ bool MergeTreeIndexConditionBloomFilter::traverseAtomAST(const ASTPtr & node, Bl const_value.getType() == Field::Types::Float64) { /// Zero in all types is represented in memory the same way as in UInt64. - out.function = const_value.get() ? RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE; + out.function = const_value.reinterpret() ? RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE; return true; } } @@ -234,7 +236,7 @@ bool MergeTreeIndexConditionBloomFilter::traverseAtomAST(const ASTPtr & node, Bl if (const auto & prepared_set = getPreparedSet(arguments[1])) return traverseASTIn(function->name, arguments[0], prepared_set, out); } - else if (function->name == "equals" || function->name == "notEquals" || function->name == "has") + else if (function->name == "equals" || function->name == "notEquals" || function->name == "has" || function->name == "hasAny") { Field const_value; DataTypePtr const_type; @@ -322,10 +324,38 @@ bool MergeTreeIndexConditionBloomFilter::traverseASTEquals( Field converted_field = convertFieldToType(value_field, *actual_type, value_type.get()); out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field))); } + else if (function_name == "hasAny") + { + if (!array_type) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be an array.", function_name); + + if (value_field.getType() != Field::Types::Array) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument for function {} must be an array.", function_name); + + const DataTypePtr actual_type = BloomFilter::getPrimitiveType(array_type->getNestedType()); + ColumnPtr column; + { + const auto is_nullable = actual_type->isNullable(); + auto mutable_column = actual_type->createColumn(); + + for (const auto & f : value_field.get()) + { + if (f.isNull() && !is_nullable) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument for function {} can't contain NULLs.", function_name); + + mutable_column->insert(convertFieldToType(f, *actual_type, value_type.get())); + } + + column = std::move(mutable_column); + } + + out.function = RPNElement::FUNCTION_HAS_ANY; + out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(actual_type, column, 0, column->size()))); + } else { if (array_type) - throw Exception("An array type of bloom_filter supports only has() function.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception("An array type of bloom_filter supports only has() and hasAny() functions.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); out.function = function_name == "equals" ? RPNElement::FUNCTION_EQUALS : RPNElement::FUNCTION_NOT_EQUALS; const DataTypePtr actual_type = BloomFilter::getPrimitiveType(index_type); diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h index 41b7416253f8..f7a288b44644 100644 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h +++ b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h @@ -24,6 +24,7 @@ class MergeTreeIndexConditionBloomFilter : public IMergeTreeIndexCondition FUNCTION_EQUALS, FUNCTION_NOT_EQUALS, FUNCTION_HAS, + FUNCTION_HAS_ANY, FUNCTION_IN, FUNCTION_NOT_IN, FUNCTION_UNKNOWN, /// Can take any value. diff --git a/tests/queries/0_stateless/01888_bloom_filter_hasAny.reference b/tests/queries/0_stateless/01888_bloom_filter_hasAny.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/queries/0_stateless/01888_bloom_filter_hasAny.sql b/tests/queries/0_stateless/01888_bloom_filter_hasAny.sql new file mode 100644 index 000000000000..f7929cfa715c --- /dev/null +++ b/tests/queries/0_stateless/01888_bloom_filter_hasAny.sql @@ -0,0 +1,30 @@ +CREATE TABLE bftest ( + k Int64, + x Array(Int64), + index ix1(x) TYPE bloom_filter GRANULARITY 3 +) +Engine=MergeTree +ORDER BY k; + +INSERT INTO bftest SELECT number, arrayMap(i->rand64()%565656, range(10)) FROM numbers(1000); + +SET force_data_skipping_indices='ix1'; +SELECT count() FROM bftest WHERE has (x, 42) or has(x, -42) FORMAT Null; +SELECT count() FROM bftest WHERE hasAny(x, [42,-42]) FORMAT Null; +SELECT count() FROM bftest WHERE hasAny(x, []) FORMAT Null; +SELECT count() FROM bftest WHERE hasAny(x, [1]) FORMAT Null; + +-- can't use bloom_filter with `hasAny` on non-constant arguments (just like `has`) +SELECT count() FROM bftest WHERE hasAny(x, materialize([1,2,3])) FORMAT Null; -- { serverError 277 } + +-- NULLs are not Ok +SELECT count() FROM bftest WHERE hasAny(x, [NULL,-42]) FORMAT Null; -- { serverError 43 } +SELECT count() FROM bftest WHERE hasAny(x, [0,NULL]) FORMAT Null; -- { serverError 43 } + +-- non-compatible types +SELECT count() FROM bftest WHERE hasAny(x, [[123], -42]) FORMAT Null; -- { serverError 386 } +SELECT count() FROM bftest WHERE hasAny(x, [toDecimal32(123, 3), 2]) FORMAT Null; -- { serverError 53 } + +-- Bug discovered by AST fuzzier (shouldn't crash). +SELECT count() FROM bftest WHERE has(x, -0.) OR 0 FORMAT Null; +SELECT count() FROM bftest WHERE hasAny(x, [0, 1]) OR 0 FORMAT Null; From 22bf53483e12db96494b486c23ff1314b1f30e50 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Sat, 5 Jun 2021 00:08:36 +0300 Subject: [PATCH 2/2] Fixed the test Previous test version wasn't causing bug on unfixed code, this new one does. --- tests/queries/0_stateless/01888_bloom_filter_hasAny.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/01888_bloom_filter_hasAny.sql b/tests/queries/0_stateless/01888_bloom_filter_hasAny.sql index f7929cfa715c..35b2022c10dc 100644 --- a/tests/queries/0_stateless/01888_bloom_filter_hasAny.sql +++ b/tests/queries/0_stateless/01888_bloom_filter_hasAny.sql @@ -25,6 +25,6 @@ SELECT count() FROM bftest WHERE hasAny(x, [0,NULL]) FORMAT Null; -- { serverErr SELECT count() FROM bftest WHERE hasAny(x, [[123], -42]) FORMAT Null; -- { serverError 386 } SELECT count() FROM bftest WHERE hasAny(x, [toDecimal32(123, 3), 2]) FORMAT Null; -- { serverError 53 } --- Bug discovered by AST fuzzier (shouldn't crash). -SELECT count() FROM bftest WHERE has(x, -0.) OR 0 FORMAT Null; -SELECT count() FROM bftest WHERE hasAny(x, [0, 1]) OR 0 FORMAT Null; +-- Bug discovered by AST fuzzier (fixed, shouldn't crash). +SELECT 1 FROM bftest WHERE has(x, -0.) OR 0. FORMAT Null; +SELECT count() FROM bftest WHERE hasAny(x, [0, 1]) OR 0. FORMAT Null;