From 1195ef5535163a6b22965ab75e06890bd1a880c3 Mon Sep 17 00:00:00 2001 From: Orri Erling Date: Thu, 7 Dec 2023 14:24:55 -0800 Subject: [PATCH] Fix parallel build overflow for last bucket A parallel build inserts one range of buckets per thread. If an insert does not fit in the last bucket in the range, it is added to overflows. The overflows are inserted sequentially at the end of the build. When inserting overflows, there are no partition bounds and as long as there is at least one free slot the insert cannot fail. However, when inserting the overflows, the upper bound of the partition must be -1 to indicate no bounds. If it is sizeMask + 1 and the last bucket is full, the insert cannot wrap around to the first bucket like it should. --- velox/exec/HashTable.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/velox/exec/HashTable.cpp b/velox/exec/HashTable.cpp index 8ecbe18145ac..ded3a1753df5 100644 --- a/velox/exec/HashTable.cpp +++ b/velox/exec/HashTable.cpp @@ -930,12 +930,7 @@ void HashTable::parallelJoinBuild() { false, hashes); insertForJoin( - overflows.data(), - hashes.data(), - overflows.size(), - 0, - sizeMask_ + 1, - nullptr); + overflows.data(), hashes.data(), overflows.size(), 0, -1, nullptr); auto table = i == 0 ? this : otherTables_[i - 1].get(); VELOX_CHECK_EQ(table->rows()->numRows(), table->numParallelBuildRows_); } @@ -1113,6 +1108,9 @@ FOLLY_ALWAYS_INLINE void HashTable::buildFullProbe( PartitionBoundIndexType partitionBegin, PartitionBoundIndexType partitionEnd, std::vector* overflows) { + VELOX_DCHECK( + partitionEnd >= 0 ? overflows == nullptr : overflows != nullptr, + "if partition bounds are given, overflows must also be given."); auto insertFn = [&](int32_t /*row*/, PartitionBoundIndexType index) { if (index < partitionBegin || index >= partitionEnd) { overflows->push_back(inserted);