Skip to content

Commit

Permalink
Add check for overlapping ranges of ARRAY and MAP vectors (facebookin…
Browse files Browse the repository at this point in the history
…cubator#10960)

Summary:
Pull Request resolved: facebookincubator#10960

We don't allow overlapping ranges in ARRAY and MAP vectors.  However this is not clear in the code, so we add a method for user to check that.

In the future we will also add the check to `BaseVector::validate()` and `ArrayVectorBase` constructor to make it clear it is not allowed.

Reviewed By: xiaoxmeng

Differential Revision: D62212238
  • Loading branch information
Yuhta authored and facebook-github-bot committed Sep 16, 2024
1 parent c167381 commit 888c044
Show file tree
Hide file tree
Showing 10 changed files with 129 additions and 68 deletions.
34 changes: 11 additions & 23 deletions velox/connectors/hive/tests/HivePartitionFunctionTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,23 +319,17 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) {
auto rawSizes = sizesBuffer->asMutable<vector_size_t>();
auto rawNulls = nullsBuffer->asMutable<uint64_t>();

// Make the elements overlap and have gaps.
// Make the elements have gaps.
// Set the values in position 2 to be invalid since that Array should be null.
std::vector<vector_size_t> offsets{
0, 2, std::numeric_limits<int32_t>().max(), 1, 8};
0, 2, std::numeric_limits<int32_t>().max(), 4, 8};
std::vector<vector_size_t> sizes{
4, 3, std::numeric_limits<int32_t>().max(), 5, 2};
2, 1, std::numeric_limits<int32_t>().max(), 3, 2};
memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t));
memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t));

bits::setNull(rawNulls, 2);

// Produces arrays that look like:
// [9, NULL, 7, 6]
// [7, 6, 5]
// NULL
// [NULL, 7, 6, 5, NULL]
// [1, NULL]
auto values = std::make_shared<ArrayVector>(
pool_.get(),
ARRAY(elements->type()),
Expand All @@ -346,9 +340,9 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) {
encodedElements);

assertPartitions(values, 1, {0, 0, 0, 0, 0});
assertPartitions(values, 2, {0, 0, 0, 0, 1});
assertPartitions(values, 500, {342, 418, 0, 458, 31});
assertPartitions(values, 997, {149, 936, 0, 103, 31});
assertPartitions(values, 2, {1, 1, 0, 0, 1});
assertPartitions(values, 500, {279, 7, 0, 308, 31});
assertPartitions(values, 997, {279, 7, 0, 820, 31});

assertPartitionsWithConstChannel(values, 1);
assertPartitionsWithConstChannel(values, 2);
Expand Down Expand Up @@ -428,23 +422,17 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) {
auto rawSizes = sizesBuffer->asMutable<vector_size_t>();
auto rawNulls = nullsBuffer->asMutable<uint64_t>();

// Make the elements overlap and have gaps.
// Make the elements have gaps.
// Set the values in position 2 to be invalid since that Map should be null.
std::vector<vector_size_t> offsets{
0, 2, std::numeric_limits<int32_t>().max(), 1, 8};
0, 2, std::numeric_limits<int32_t>().max(), 4, 8};
std::vector<vector_size_t> sizes{
4, 3, std::numeric_limits<int32_t>().max(), 5, 2};
2, 1, std::numeric_limits<int32_t>().max(), 3, 2};
memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t));
memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t));

bits::setNull(rawNulls, 2);

// Produces maps that look like:
// {key_0: 9, key_3: NULL, key_6: 7, key_9: 6}
// {key_6: 7, key_9: 6, key_2: 5}
// NULL
// {key_3: NULL, key_6: 7, key_9: 6, key_2: 5, key_5: NULL}
// {key_4: 1, key_7: NULL}
auto values = std::make_shared<MapVector>(
pool_.get(),
MAP(mapKeys->type(), mapValues->type()),
Expand All @@ -457,8 +445,8 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) {

assertPartitions(values, 1, {0, 0, 0, 0, 0});
assertPartitions(values, 2, {0, 1, 0, 1, 0});
assertPartitions(values, 500, {176, 259, 0, 91, 336});
assertPartitions(values, 997, {694, 24, 0, 365, 345});
assertPartitions(values, 500, {336, 413, 0, 259, 336});
assertPartitions(values, 997, {345, 666, 0, 24, 345});

assertPartitionsWithConstChannel(values, 1);
assertPartitionsWithConstChannel(values, 2);
Expand Down
10 changes: 6 additions & 4 deletions velox/docs/develop/vectors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -386,8 +386,9 @@ ArrayVector
~~~~~~~~~~~

ArrayVector stores values of type ARRAY. In addition to nulls buffer, it
contains offsets and sizes buffers and an elements vector. Offsets and sizes
are 32-bit integers.
contains offsets and sizes buffers and an elements vector. Offsets and sizes are
32-bit integers. The non-null non-empty ranges formed by offsets and sizes in a
vector is not allowed to overlap with each other.

.. code-block:: c++

Expand Down Expand Up @@ -443,8 +444,9 @@ MapVector
~~~~~~~~~

MapVector stores values of type MAP. In addition to nulls buffer, it contains
offsets and sizes buffers, keys and values vectors. Offsets and sizes are
32-bit integers.
offsets and sizes buffers, keys and values vectors. Offsets and sizes are 32-bit
integers. The non-null non-empty ranges formed by offsets and sizes in a vector
is not allowed to overlap with each other.

.. code-block:: c++

Expand Down
6 changes: 4 additions & 2 deletions velox/expression/tests/CastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1279,7 +1279,8 @@ TEST_F(CastExprTest, mapCast) {

SelectivityVector rows(5);
rows.setValid(2, false);
mapVector->setOffsetAndSize(2, 100, 100);
mapVector->setOffsetAndSize(2, 100, 1);
mapVector->setOffsetAndSize(51, 2, 1);
std::vector<VectorPtr> results(1);

auto rowVector = makeRowVector({mapVector});
Expand Down Expand Up @@ -1369,7 +1370,8 @@ TEST_F(CastExprTest, arrayCast) {

SelectivityVector rows(5);
rows.setValid(2, false);
arrayVector->setOffsetAndSize(2, 100, 10);
arrayVector->setOffsetAndSize(2, 100, 5);
arrayVector->setOffsetAndSize(20, 10, 5);
std::vector<VectorPtr> results(1);

auto rowVector = makeRowVector({arrayVector});
Expand Down
10 changes: 5 additions & 5 deletions velox/expression/tests/ExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3568,10 +3568,10 @@ TEST_P(ParameterizedExprTest, mapKeysAndValues) {
MAP(BIGINT(), BIGINT()),
makeNulls(vectorSize, nullEvery(3)),
vectorSize,
makeIndices(vectorSize, [](auto /* row */) { return 0; }),
makeIndices(vectorSize, folly::identity),
makeIndices(vectorSize, [](auto /* row */) { return 1; }),
makeFlatVector<int64_t>({1, 2, 3}),
makeFlatVector<int64_t>({10, 20, 30}));
makeFlatVector<int64_t>(vectorSize, folly::identity),
makeFlatVector<int64_t>(vectorSize, [](auto i) { return 10 * i; }));
auto input = makeRowVector({mapVector});
auto exprSet = compileMultiple(
{"map_keys(c0)", "map_values(c0)"}, asRowType(input->type()));
Expand Down Expand Up @@ -4805,7 +4805,7 @@ TEST_F(ExprTest, disablePeeling) {
// Verify that peeling is disabled when the config is set by checking whether
// the number of rows processed is equal to the alphabet size (when enabled)
// or the dictionary size (when disabled).
// Also, ensure that single arg function recieves a flat vector even when
// Also, ensure that single arg function receives a flat vector even when
// peeling is disabled.

// This throws if input is not flat or constant.
Expand Down Expand Up @@ -4847,7 +4847,7 @@ TEST_F(ExprTest, disablePeeling) {
ASSERT_TRUE(stats.find("plus") != stats.end());
ASSERT_EQ(stats["plus"].numProcessedRows, dictSize);

// Ensure single arg function recieves a flat vector.
// Ensure single arg function receives a flat vector.
// When top level column is dictionary wrapped.
ASSERT_NO_THROW(evaluateMultiple(
{"testing_single_arg_deterministic((c0))"},
Expand Down
2 changes: 1 addition & 1 deletion velox/functions/lib/tests/SliceTestBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class SliceTestBase : public FunctionBaseTest {
const ArrayVectorPtr& expectedArrayVector) {
auto result = evaluate<ArrayVector>(expression, makeRowVector(parameters));
::facebook::velox::test::assertEqualVectors(expectedArrayVector, result);
EXPECT_NO_THROW(expectedArrayVector->checkRanges());
EXPECT_FALSE(expectedArrayVector->hasOverlappingRanges());
}

void basicTestCases() {
Expand Down
12 changes: 6 additions & 6 deletions velox/functions/prestosql/tests/MapEntriesTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,14 @@ TEST_F(MapEntriesTest, differentSizedValueKeyVectors) {
makeNullableFlatVector<int64_t>({1, 2, 3, 4, std::nullopt, std::nullopt});
auto valueVector = makeFlatVector<int64_t>({1, 2, 3, 4});

auto offsetBuffer = makeIndices({3, 2, 1, 0, 0, 0});
auto sizeBuffer = makeIndices({1, 1, 1, 1, 1, 1});
auto offsetBuffer = makeIndices({3, 2, 1, 0});
auto sizeBuffer = makeIndices({1, 1, 1, 1});

auto mapVector = std::make_shared<MapVector>(
pool(),
MAP(BIGINT(), BIGINT()),
nullptr,
6,
4,
offsetBuffer,
sizeBuffer,
keyVector,
Expand All @@ -188,9 +188,9 @@ TEST_F(MapEntriesTest, differentSizedValueKeyVectors) {

EXPECT_NE(result, nullptr);
auto elementVector = makeRowVector(
{makeFlatVector<int64_t>({4, 3, 2, 1, 1, 1}),
makeFlatVector<int64_t>({4, 3, 2, 1, 1, 1})});
auto arrayVector = makeArrayVector({0, 1, 2, 3, 4, 5}, elementVector);
{makeFlatVector<int64_t>({4, 3, 2, 1}),
makeFlatVector<int64_t>({4, 3, 2, 1})});
auto arrayVector = makeArrayVector({0, 1, 2, 3}, elementVector);

test::assertEqualVectors(arrayVector, result);
}
2 changes: 1 addition & 1 deletion velox/functions/prestosql/tests/ZipTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ TEST_F(ZipTest, flatArraysWithDifferentOffsets) {

/// Test if we can zip two flat vectors of arrays with overlapping ranges of
/// elements.
TEST_F(ZipTest, flatArraysWithOverlappingRanges) {
TEST_F(ZipTest, DISABLED_flatArraysWithOverlappingRanges) {
const auto size = 3;
BufferPtr offsetsBuffer = allocateOffsets(size, pool_.get());
BufferPtr sizesBuffer = allocateSizes(size, pool_.get());
Expand Down
70 changes: 49 additions & 21 deletions velox/vector/ComplexVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -809,31 +809,59 @@ VectorPtr RowVector::pushDictionaryToRowVectorLeaves(const VectorPtr& input) {
wrappers, input->size(), input, input->pool());
}

void ArrayVectorBase::checkRanges() const {
std::unordered_map<vector_size_t, vector_size_t> seenElements;
seenElements.reserve(size());
template <bool kHasNulls>
vector_size_t ArrayVectorBase::nextNonEmpty(vector_size_t i) const {
while (i < size() &&
((kHasNulls && bits::isBitNull(rawNulls(), i)) || rawSizes_[i] <= 0)) {
++i;
}
return i;
}

template <bool kHasNulls>
bool ArrayVectorBase::maybeHaveOverlappingRanges() const {
vector_size_t curr = 0;
curr = nextNonEmpty<kHasNulls>(curr);
if (curr >= size()) {
return false;
}
for (;;) {
auto next = nextNonEmpty<kHasNulls>(curr + 1);
if (next >= size()) {
return false;
}
// This also implicitly ensures rawOffsets_[curr] <= rawOffsets_[next].
if (rawOffsets_[curr] + rawSizes_[curr] > rawOffsets_[next]) {
return true;
}
curr = next;
}
}

bool ArrayVectorBase::hasOverlappingRanges() const {
if (!(rawNulls() ? maybeHaveOverlappingRanges<true>()
: maybeHaveOverlappingRanges<false>())) {
return false;
}
std::vector<vector_size_t> indices;
indices.reserve(size());
for (vector_size_t i = 0; i < size(); ++i) {
auto size = sizeAt(i);
auto offset = offsetAt(i);

for (vector_size_t j = 0; j < size; ++j) {
auto it = seenElements.find(offset + j);
if (it != seenElements.end()) {
VELOX_FAIL(
"checkRanges() found overlap at idx {}: element {} has offset {} "
"and size {}, and element {} has offset {} and size {}.",
offset + j,
it->second,
offsetAt(it->second),
sizeAt(it->second),
i,
offset,
size);
}
seenElements.emplace(offset + j, i);
const bool isNull = rawNulls() && bits::isBitNull(rawNulls(), i);
if (!isNull && rawSizes_[i] > 0) {
indices.push_back(i);
}
}
std::sort(indices.begin(), indices.end(), [&](auto i, auto j) {
return rawOffsets_[i] < rawOffsets_[j];
});
for (vector_size_t i = 1; i < indices.size(); ++i) {
auto j = indices[i - 1];
auto k = indices[i];
if (rawOffsets_[j] + rawSizes_[j] > rawOffsets_[k]) {
return true;
}
}
return false;
}

void ArrayVectorBase::validateArrayVectorBase(
Expand Down
13 changes: 10 additions & 3 deletions velox/vector/ComplexVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -364,9 +364,8 @@ struct ArrayVectorBase : BaseVector {
sizes_->asMutable<vector_size_t>()[i] = size;
}

/// Verify that an ArrayVector/MapVector does not contain overlapping [offset,
/// size] ranges. Throws in case overlaps are found.
void checkRanges() const;
/// Check if there is any overlapping [offset, size] ranges.
bool hasOverlappingRanges() const;

protected:
ArrayVectorBase(
Expand Down Expand Up @@ -411,6 +410,14 @@ struct ArrayVectorBase : BaseVector {
const vector_size_t* rawOffsets_;
BufferPtr sizes_;
const vector_size_t* rawSizes_;

private:
template <bool kHasNulls>
bool maybeHaveOverlappingRanges() const;

// Return the next non-null non-empty array/map after `index'.
template <bool kHasNulls>
vector_size_t nextNonEmpty(vector_size_t index) const;
};

class ArrayVector : public ArrayVectorBase {
Expand Down
38 changes: 36 additions & 2 deletions velox/vector/tests/VectorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2882,7 +2882,7 @@ TEST_F(VectorTest, resizeArrayAndMapResetOffsets) {

// Test array.
{
auto offsets = makeIndices({1, 1, 1, 1});
auto offsets = makeIndices({0, 1, 2, 3});
auto sizes = makeIndices({1, 1, 1, 1});

auto* rawSizes = sizes->as<vector_size_t>();
Expand All @@ -2907,7 +2907,7 @@ TEST_F(VectorTest, resizeArrayAndMapResetOffsets) {

// Test map.
{
auto offsets = makeIndices({1, 1, 1, 1});
auto offsets = makeIndices({0, 1, 2, 3});
auto sizes = makeIndices({1, 1, 1, 1});

auto* rawSizes = sizes->as<vector_size_t>();
Expand Down Expand Up @@ -3899,5 +3899,39 @@ TEST_F(VectorTest, testOverSizedArray) {
EXPECT_THROW(BaseVector::flattenVector(constArray), VeloxUserError);
}

TEST_F(VectorTest, hasOverlappingRanges) {
auto test = [this](
vector_size_t length,
const std::vector<bool>& nulls,
const std::vector<vector_size_t>& offsets,
const std::vector<vector_size_t>& sizes,
bool overlap) {
auto makeArray = [&] {
return std::make_shared<ArrayVector>(
pool(),
ARRAY(BIGINT()),
makeNulls(nulls),
length,
makeIndices(length, [&](auto i) { return offsets[i]; }),
makeIndices(length, [&](auto i) { return sizes[i]; }),
makeNullConstant(
TypeKind::BIGINT, std::numeric_limits<vector_size_t>::max()));
};
if (!overlap) {
ASSERT_FALSE(makeArray()->hasOverlappingRanges());
} else {
ASSERT_TRUE(makeArray()->hasOverlappingRanges());
}
};
test(3, {false, false, false}, {0, 1, 2}, {1, 1, 1}, false);
test(3, {false, true, false}, {0, 0, 2}, {1, 1, 1}, false);
test(3, {false, false, false}, {0, 0, 2}, {1, 0, 1}, false);
test(3, {false, false, false}, {0, 0, 2}, {1, 1, 1}, true);
test(3, {false, false, false}, {2, 1, 0}, {1, 1, 1}, false);
test(3, {false, false, false}, {2, 1, 0}, {1, 2, 1}, true);
test(2, {false, false}, {0, 1}, {3, 1}, true);
test(2, {false, false}, {1, 0}, {1, 3}, true);
}

} // namespace
} // namespace facebook::velox

0 comments on commit 888c044

Please sign in to comment.