From 876919c45143af391da1ed46af2ff4774d999784 Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Tue, 10 Sep 2024 08:46:35 -0700 Subject: [PATCH] Add check to forbid overlap ranges for ARRAY and MAP vectors Summary: We don't allow overlapping ranges in ARRAY and MAP vectors. However this is not clear in the code, so we add a method for user to check that, and also add the check to `BaseVector::validate()` method to make it clear it is not valid. Later we will also add the check to `ArrayVectorBase` constructor. Differential Revision: D62212238 --- velox/connectors/hive/HiveConnectorUtil.cpp | 2 +- .../hive/tests/HivePartitionFunctionTest.cpp | 34 +++------ velox/docs/develop/vectors.rst | 10 +-- velox/expression/tests/CastExprTest.cpp | 6 +- velox/expression/tests/ExprTest.cpp | 6 +- velox/functions/lib/tests/SliceTestBase.h | 2 +- .../prestosql/tests/MapEntriesTest.cpp | 12 ++-- velox/functions/prestosql/tests/ZipTest.cpp | 2 +- velox/vector/ComplexVector.cpp | 72 +++++++++++++------ velox/vector/ComplexVector.h | 12 +++- velox/vector/tests/VectorTest.cpp | 38 +++++++++- 11 files changed, 129 insertions(+), 67 deletions(-) diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index 236120e1763f2..ca197de5bba8d 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -28,7 +28,7 @@ #include "velox/dwio/dwrf/writer/Writer.h" #ifdef VELOX_ENABLE_PARQUET -#include "velox/dwio/parquet/writer/Writer.h" +#include "velox/dwio/parquet/writer/Writer.h" // @manual #endif #include "velox/expression/Expr.h" diff --git a/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp b/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp index 5884d1fc0fd42..eee2eb8139c49 100644 --- a/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp +++ b/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp @@ -319,23 +319,17 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) { auto rawSizes = sizesBuffer->asMutable(); auto rawNulls = nullsBuffer->asMutable(); - // Make the elements overlap and have gaps. + // Make the elements have gaps. // Set the values in position 2 to be invalid since that Array should be null. std::vector offsets{ - 0, 2, std::numeric_limits().max(), 1, 8}; + 0, 2, std::numeric_limits().max(), 4, 8}; std::vector sizes{ - 4, 3, std::numeric_limits().max(), 5, 2}; + 2, 1, std::numeric_limits().max(), 3, 2}; memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t)); memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t)); bits::setNull(rawNulls, 2); - // Produces arrays that look like: - // [9, NULL, 7, 6] - // [7, 6, 5] - // NULL - // [NULL, 7, 6, 5, NULL] - // [1, NULL] auto values = std::make_shared( pool_.get(), ARRAY(elements->type()), @@ -346,9 +340,9 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) { encodedElements); assertPartitions(values, 1, {0, 0, 0, 0, 0}); - assertPartitions(values, 2, {0, 0, 0, 0, 1}); - assertPartitions(values, 500, {342, 418, 0, 458, 31}); - assertPartitions(values, 997, {149, 936, 0, 103, 31}); + assertPartitions(values, 2, {1, 1, 0, 0, 1}); + assertPartitions(values, 500, {279, 7, 0, 308, 31}); + assertPartitions(values, 997, {279, 7, 0, 820, 31}); assertPartitionsWithConstChannel(values, 1); assertPartitionsWithConstChannel(values, 2); @@ -428,23 +422,17 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) { auto rawSizes = sizesBuffer->asMutable(); auto rawNulls = nullsBuffer->asMutable(); - // Make the elements overlap and have gaps. + // Make the elements have gaps. // Set the values in position 2 to be invalid since that Map should be null. std::vector offsets{ - 0, 2, std::numeric_limits().max(), 1, 8}; + 0, 2, std::numeric_limits().max(), 4, 8}; std::vector sizes{ - 4, 3, std::numeric_limits().max(), 5, 2}; + 2, 1, std::numeric_limits().max(), 3, 2}; memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t)); memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t)); bits::setNull(rawNulls, 2); - // Produces maps that look like: - // {key_0: 9, key_3: NULL, key_6: 7, key_9: 6} - // {key_6: 7, key_9: 6, key_2: 5} - // NULL - // {key_3: NULL, key_6: 7, key_9: 6, key_2: 5, key_5: NULL} - // {key_4: 1, key_7: NULL} auto values = std::make_shared( pool_.get(), MAP(mapKeys->type(), mapValues->type()), @@ -457,8 +445,8 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) { assertPartitions(values, 1, {0, 0, 0, 0, 0}); assertPartitions(values, 2, {0, 1, 0, 1, 0}); - assertPartitions(values, 500, {176, 259, 0, 91, 336}); - assertPartitions(values, 997, {694, 24, 0, 365, 345}); + assertPartitions(values, 500, {336, 413, 0, 259, 336}); + assertPartitions(values, 997, {345, 666, 0, 24, 345}); assertPartitionsWithConstChannel(values, 1); assertPartitionsWithConstChannel(values, 2); diff --git a/velox/docs/develop/vectors.rst b/velox/docs/develop/vectors.rst index 70515ec9f4304..1a3cb0b0b6a41 100644 --- a/velox/docs/develop/vectors.rst +++ b/velox/docs/develop/vectors.rst @@ -386,8 +386,9 @@ ArrayVector ~~~~~~~~~~~ ArrayVector stores values of type ARRAY. In addition to nulls buffer, it -contains offsets and sizes buffers and an elements vector. Offsets and sizes -are 32-bit integers. +contains offsets and sizes buffers and an elements vector. Offsets and sizes are +32-bit integers. The non-null non-empty ranges formed by offsets and sizes in a +vector is not allowed to overlap with each other. .. code-block:: c++ @@ -443,8 +444,9 @@ MapVector ~~~~~~~~~ MapVector stores values of type MAP. In addition to nulls buffer, it contains -offsets and sizes buffers, keys and values vectors. Offsets and sizes are -32-bit integers. +offsets and sizes buffers, keys and values vectors. Offsets and sizes are 32-bit +integers. The non-null non-empty ranges formed by offsets and sizes in a vector +is not allowed to overlap with each other. .. code-block:: c++ diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index 3ad743e31d21c..67317cc1758e6 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -1279,7 +1279,8 @@ TEST_F(CastExprTest, mapCast) { SelectivityVector rows(5); rows.setValid(2, false); - mapVector->setOffsetAndSize(2, 100, 100); + mapVector->setOffsetAndSize(2, 100, 1); + mapVector->setOffsetAndSize(51, 2, 1); std::vector results(1); auto rowVector = makeRowVector({mapVector}); @@ -1369,7 +1370,8 @@ TEST_F(CastExprTest, arrayCast) { SelectivityVector rows(5); rows.setValid(2, false); - arrayVector->setOffsetAndSize(2, 100, 10); + arrayVector->setOffsetAndSize(2, 100, 5); + arrayVector->setOffsetAndSize(20, 10, 5); std::vector results(1); auto rowVector = makeRowVector({arrayVector}); diff --git a/velox/expression/tests/ExprTest.cpp b/velox/expression/tests/ExprTest.cpp index 100d0acb71b1b..b4390c74bcc29 100644 --- a/velox/expression/tests/ExprTest.cpp +++ b/velox/expression/tests/ExprTest.cpp @@ -3568,10 +3568,10 @@ TEST_P(ParameterizedExprTest, mapKeysAndValues) { MAP(BIGINT(), BIGINT()), makeNulls(vectorSize, nullEvery(3)), vectorSize, - makeIndices(vectorSize, [](auto /* row */) { return 0; }), + makeIndices(vectorSize, folly::identity), makeIndices(vectorSize, [](auto /* row */) { return 1; }), - makeFlatVector({1, 2, 3}), - makeFlatVector({10, 20, 30})); + makeFlatVector(vectorSize, folly::identity), + makeFlatVector(vectorSize, [](auto i) { return 10 * i; })); auto input = makeRowVector({mapVector}); auto exprSet = compileMultiple( {"map_keys(c0)", "map_values(c0)"}, asRowType(input->type())); diff --git a/velox/functions/lib/tests/SliceTestBase.h b/velox/functions/lib/tests/SliceTestBase.h index 8ea998ca888c2..bdb54550fbe68 100644 --- a/velox/functions/lib/tests/SliceTestBase.h +++ b/velox/functions/lib/tests/SliceTestBase.h @@ -35,7 +35,7 @@ class SliceTestBase : public FunctionBaseTest { const ArrayVectorPtr& expectedArrayVector) { auto result = evaluate(expression, makeRowVector(parameters)); ::facebook::velox::test::assertEqualVectors(expectedArrayVector, result); - EXPECT_NO_THROW(expectedArrayVector->checkRanges()); + EXPECT_FALSE(expectedArrayVector->hasOverlappingRanges()); } void basicTestCases() { diff --git a/velox/functions/prestosql/tests/MapEntriesTest.cpp b/velox/functions/prestosql/tests/MapEntriesTest.cpp index f94ae8b3e3ef9..a152a0a7e7007 100644 --- a/velox/functions/prestosql/tests/MapEntriesTest.cpp +++ b/velox/functions/prestosql/tests/MapEntriesTest.cpp @@ -168,14 +168,14 @@ TEST_F(MapEntriesTest, differentSizedValueKeyVectors) { makeNullableFlatVector({1, 2, 3, 4, std::nullopt, std::nullopt}); auto valueVector = makeFlatVector({1, 2, 3, 4}); - auto offsetBuffer = makeIndices({3, 2, 1, 0, 0, 0}); - auto sizeBuffer = makeIndices({1, 1, 1, 1, 1, 1}); + auto offsetBuffer = makeIndices({3, 2, 1, 0}); + auto sizeBuffer = makeIndices({1, 1, 1, 1}); auto mapVector = std::make_shared( pool(), MAP(BIGINT(), BIGINT()), nullptr, - 6, + 4, offsetBuffer, sizeBuffer, keyVector, @@ -188,9 +188,9 @@ TEST_F(MapEntriesTest, differentSizedValueKeyVectors) { EXPECT_NE(result, nullptr); auto elementVector = makeRowVector( - {makeFlatVector({4, 3, 2, 1, 1, 1}), - makeFlatVector({4, 3, 2, 1, 1, 1})}); - auto arrayVector = makeArrayVector({0, 1, 2, 3, 4, 5}, elementVector); + {makeFlatVector({4, 3, 2, 1}), + makeFlatVector({4, 3, 2, 1})}); + auto arrayVector = makeArrayVector({0, 1, 2, 3}, elementVector); test::assertEqualVectors(arrayVector, result); } diff --git a/velox/functions/prestosql/tests/ZipTest.cpp b/velox/functions/prestosql/tests/ZipTest.cpp index 9ed34c9df322e..a74f951205386 100644 --- a/velox/functions/prestosql/tests/ZipTest.cpp +++ b/velox/functions/prestosql/tests/ZipTest.cpp @@ -326,7 +326,7 @@ TEST_F(ZipTest, flatArraysWithDifferentOffsets) { /// Test if we can zip two flat vectors of arrays with overlapping ranges of /// elements. -TEST_F(ZipTest, flatArraysWithOverlappingRanges) { +TEST_F(ZipTest, DISABLED_flatArraysWithOverlappingRanges) { const auto size = 3; BufferPtr offsetsBuffer = allocateOffsets(size, pool_.get()); BufferPtr sizesBuffer = allocateSizes(size, pool_.get()); diff --git a/velox/vector/ComplexVector.cpp b/velox/vector/ComplexVector.cpp index f13a9d602a753..a49fe5886e901 100644 --- a/velox/vector/ComplexVector.cpp +++ b/velox/vector/ComplexVector.cpp @@ -809,31 +809,58 @@ VectorPtr RowVector::pushDictionaryToRowVectorLeaves(const VectorPtr& input) { wrappers, input->size(), input, input->pool()); } -void ArrayVectorBase::checkRanges() const { - std::unordered_map seenElements; - seenElements.reserve(size()); +template +vector_size_t ArrayVectorBase::nextNonEmpty(vector_size_t i) const { + while (i < size() && + ((kHasNulls && bits::isBitNull(rawNulls(), i)) || rawSizes_[i] <= 0)) { + ++i; + } + return i; +} + +template +bool ArrayVectorBase::maybeHaveOverlappingRanges() const { + vector_size_t i = 0; + i = nextNonEmpty(i); + if (i >= size()) { + return false; + } + for (;;) { + auto j = nextNonEmpty(i + 1); + if (j >= size()) { + return false; + } + if (rawOffsets_[i] + rawSizes_[i] > rawOffsets_[j]) { + return true; + } + i = j; + } +} +bool ArrayVectorBase::hasOverlappingRanges() const { + if (!(rawNulls() ? maybeHaveOverlappingRanges() + : maybeHaveOverlappingRanges())) { + return false; + } + std::vector indices; + indices.reserve(size()); for (vector_size_t i = 0; i < size(); ++i) { - auto size = sizeAt(i); - auto offset = offsetAt(i); - - for (vector_size_t j = 0; j < size; ++j) { - auto it = seenElements.find(offset + j); - if (it != seenElements.end()) { - VELOX_FAIL( - "checkRanges() found overlap at idx {}: element {} has offset {} " - "and size {}, and element {} has offset {} and size {}.", - offset + j, - it->second, - offsetAt(it->second), - sizeAt(it->second), - i, - offset, - size); - } - seenElements.emplace(offset + j, i); + bool isNull = rawNulls() && bits::isBitNull(rawNulls(), i); + if (!isNull && rawSizes_[i] > 0) { + indices.push_back(i); + } + } + std::sort(indices.begin(), indices.end(), [&](auto i, auto j) { + return rawOffsets_[i] < rawOffsets_[j]; + }); + for (vector_size_t i = 1; i < indices.size(); ++i) { + auto j = indices[i - 1]; + auto k = indices[i]; + if (rawOffsets_[j] + rawSizes_[j] > rawOffsets_[k]) { + return true; } } + return false; } void ArrayVectorBase::validateArrayVectorBase( @@ -868,6 +895,9 @@ void ArrayVectorBase::validateArrayVectorBase( "vector's size. Index: {}.", i); } + VELOX_CHECK( + !hasOverlappingRanges(), + "No overlapping ranges of offsets and sizes are allowed in ARRAY or MAP vectors"); } namespace { diff --git a/velox/vector/ComplexVector.h b/velox/vector/ComplexVector.h index dcda5d6842694..567aba70a0891 100644 --- a/velox/vector/ComplexVector.h +++ b/velox/vector/ComplexVector.h @@ -364,9 +364,8 @@ struct ArrayVectorBase : BaseVector { sizes_->asMutable()[i] = size; } - /// Verify that an ArrayVector/MapVector does not contain overlapping [offset, - /// size] ranges. Throws in case overlaps are found. - void checkRanges() const; + /// Check if there is any overlapping [offset, size] ranges. + bool hasOverlappingRanges() const; protected: ArrayVectorBase( @@ -411,6 +410,13 @@ struct ArrayVectorBase : BaseVector { const vector_size_t* rawOffsets_; BufferPtr sizes_; const vector_size_t* rawSizes_; + + private: + template + bool maybeHaveOverlappingRanges() const; + + template + vector_size_t nextNonEmpty(vector_size_t i) const; }; class ArrayVector : public ArrayVectorBase { diff --git a/velox/vector/tests/VectorTest.cpp b/velox/vector/tests/VectorTest.cpp index 88cbe5b842c05..3a5525412b025 100644 --- a/velox/vector/tests/VectorTest.cpp +++ b/velox/vector/tests/VectorTest.cpp @@ -2882,7 +2882,7 @@ TEST_F(VectorTest, resizeArrayAndMapResetOffsets) { // Test array. { - auto offsets = makeIndices({1, 1, 1, 1}); + auto offsets = makeIndices({0, 1, 2, 3}); auto sizes = makeIndices({1, 1, 1, 1}); auto* rawSizes = sizes->as(); @@ -2907,7 +2907,7 @@ TEST_F(VectorTest, resizeArrayAndMapResetOffsets) { // Test map. { - auto offsets = makeIndices({1, 1, 1, 1}); + auto offsets = makeIndices({0, 1, 2, 3}); auto sizes = makeIndices({1, 1, 1, 1}); auto* rawSizes = sizes->as(); @@ -3899,5 +3899,39 @@ TEST_F(VectorTest, testOverSizedArray) { EXPECT_THROW(BaseVector::flattenVector(constArray), VeloxUserError); } +TEST_F(VectorTest, hasOverlappingRanges) { + auto test = [this]( + vector_size_t length, + const std::vector& nulls, + const std::vector& offsets, + const std::vector& sizes, + bool overlap) { + auto makeArray = [&] { + return std::make_shared( + pool(), + ARRAY(BIGINT()), + makeNulls(nulls), + length, + makeIndices(length, [&](auto i) { return offsets[i]; }), + makeIndices(length, [&](auto i) { return sizes[i]; }), + makeNullConstant( + TypeKind::BIGINT, std::numeric_limits::max())); + }; + if (!overlap) { + ASSERT_FALSE(makeArray()->hasOverlappingRanges()); + } else { + ASSERT_TRUE(makeArray()->hasOverlappingRanges()); + } + }; + test(3, {false, false, false}, {0, 1, 2}, {1, 1, 1}, false); + test(3, {false, true, false}, {0, 0, 2}, {1, 1, 1}, false); + test(3, {false, false, false}, {0, 0, 2}, {1, 0, 1}, false); + test(3, {false, false, false}, {0, 0, 2}, {1, 1, 1}, true); + test(3, {false, false, false}, {2, 1, 0}, {1, 1, 1}, false); + test(3, {false, false, false}, {2, 1, 0}, {1, 2, 1}, true); + test(2, {false, false}, {0, 1}, {3, 1}, true); + test(2, {false, false}, {1, 0}, {1, 3}, true); +} + } // namespace } // namespace facebook::velox