-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Optimize count(distinct <complex type>) (#8560)
Summary: Aggregations over distinct inputs use `SetAccumulator<ComplexType>`, which in turn uses AddressableNonNullValueList to store unique complex type values in a single non-contiguous allocation within HashStringAllocator. Storing thousands or millions of values requires an allocation with hundreds or thousands of contiguous pieces. Calling HashStringAllocator::prepareRead on such an allocation ends up populating an `std::vector<ByteRange>` with thousands of entries. Calling this repeatedly for each value as part of SetAccumulator::extractValues() becomes very slow. SetAccumulator::extractValues calls AddressableNonNullValueList::read for each unique value (there can be millions of these): for (const auto& position : base.uniqueValues) { AddressableNonNullValueList::read( position.first.position, values, offset + position.second); } AddressableNonNullValueList::read calls HashStringAllocator::prepareRead, which collects thousands of byte ranges into a vector passed to ByteInputStream constructor: auto stream = HashStringAllocator::prepareRead(header); As a result, queries like count(distinct <complex type>) are very slow. A fix is to modify HashStringAllocator::prepareRead to accept an optional limit on how many bytes to prepare for read, then use this in AddressableNonNullValueList::read to prepare only as many bytes as needed to extract a single value. In addition, do not store hashes of the values in HSA to avoid calling HashStringAllocator::prepareRead altogether just to fetch the hash. Added benchmark for SetAccumulator to add 10M mostly unique values and read them back. Without the optimizations, the benchmark couldn't finish within a reasonable time (a few minutes). Changing benchmark to process 100K values allowed it to complete. 
Before (100K values): ``` ============================================================================ [...]enchmarks/SetAccumulatorBenchmark.cpp relative time/iter iters/s ============================================================================ bigint 2.87ms 348.07 varchar 22.22ms 45.01 twoBigints 988.08ms 1.01 ``` After (100K values): ``` ============================================================================ [...]enchmarks/SetAccumulatorBenchmark.cpp relative time/iter iters/s ============================================================================ bigint 2.80ms 356.87 varchar 21.19ms 47.19 twoBigints 38.83ms 25.76 ``` After the optimizations, the original benchmark processing 10M values is finishing within a few seconds. After (10M values): ``` ============================================================================ [...]enchmarks/SetAccumulatorBenchmark.cpp relative time/iter iters/s ============================================================================ bigint 1.23s 814.20m varchar 2.96s 338.39m twoBigints 6.30s 158.70m ``` Pull Request resolved: #8560 Reviewed By: Yuhta Differential Revision: D53130262 Pulled By: mbasmanova fbshipit-source-id: 9401d56fc8f9d4eecdaa4bd2ef53ae6e5f6f4f07
- Loading branch information
1 parent
a113acf
commit 5b8b4bd
Showing
9 changed files
with
209 additions
and
77 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#include <folly/Benchmark.h> | ||
#include <folly/init/Init.h> | ||
|
||
#include "velox/common/memory/Memory.h" | ||
#include "velox/exec/SetAccumulator.h" | ||
#include "velox/vector/fuzzer/VectorFuzzer.h" | ||
#include "velox/vector/tests/utils/VectorTestBase.h" | ||
|
||
using namespace facebook::velox; | ||
using namespace facebook::velox::exec; | ||
|
||
namespace { | ||
|
||
// Adds 10M mostly unique values to a single SetAccumulator, then extracts | ||
// unique values from it. | ||
class SetAccumulatorBenchmark : public facebook::velox::test::VectorTestBase { | ||
public: | ||
void setup() { | ||
VectorFuzzer::Options opts; | ||
opts.vectorSize = 1'000'000; | ||
VectorFuzzer fuzzer(opts, pool()); | ||
|
||
auto rowType = ROW({"a", "b", "c"}, {BIGINT(), BIGINT(), VARCHAR()}); | ||
for (auto i = 0; i < 10; ++i) { | ||
rowVectors_.emplace_back(fuzzer.fuzzInputRow(rowType)); | ||
} | ||
} | ||
|
||
void runBigint() { | ||
runPrimitive<int64_t>("a"); | ||
} | ||
|
||
void runVarchar() { | ||
runPrimitive<StringView>("c"); | ||
} | ||
|
||
void runTwoBigints() { | ||
HashStringAllocator allocator(pool()); | ||
const TypePtr type = ROW({BIGINT(), BIGINT()}); | ||
aggregate::prestosql::SetAccumulator<ComplexType> accumulator( | ||
type, &allocator); | ||
|
||
for (const auto& rowVector : rowVectors_) { | ||
auto vector = | ||
makeRowVector({rowVector->childAt("a"), rowVector->childAt("b")}); | ||
DecodedVector decoded(*vector); | ||
for (auto i = 0; i < rowVector->size(); ++i) { | ||
accumulator.addValue(decoded, i, &allocator); | ||
} | ||
} | ||
|
||
auto result = BaseVector::create(type, accumulator.size(), pool()); | ||
accumulator.extractValues(*result, 0); | ||
folly::doNotOptimizeAway(result); | ||
} | ||
|
||
private: | ||
template <typename T> | ||
void runPrimitive(const std::string& name) { | ||
const auto& type = rowVectors_[0]->childAt(name)->type(); | ||
|
||
HashStringAllocator allocator(pool()); | ||
aggregate::prestosql::SetAccumulator<T> accumulator(type, &allocator); | ||
|
||
for (const auto& rowVector : rowVectors_) { | ||
DecodedVector decoded(*rowVector->childAt(name)); | ||
for (auto i = 0; i < rowVector->size(); ++i) { | ||
accumulator.addValue(decoded, i, &allocator); | ||
} | ||
} | ||
|
||
auto result = | ||
BaseVector::create<FlatVector<T>>(type, accumulator.size(), pool()); | ||
accumulator.extractValues(*result, 0); | ||
folly::doNotOptimizeAway(result); | ||
} | ||
|
||
std::vector<RowVectorPtr> rowVectors_; | ||
}; | ||
|
||
std::unique_ptr<SetAccumulatorBenchmark> bm; | ||
|
||
BENCHMARK(bigint) { | ||
bm->runBigint(); | ||
} | ||
|
||
BENCHMARK(varchar) { | ||
bm->runVarchar(); | ||
} | ||
|
||
BENCHMARK(twoBigints) { | ||
bm->runTwoBigints(); | ||
} | ||
|
||
} // namespace | ||
|
||
// Entry point: initializes folly and the Velox memory manager, builds the
// shared benchmark input once, runs all registered benchmarks and releases the
// fixture before returning.
int main(int argc, char** argv) {
  folly::init(&argc, &argv);
  memory::MemoryManager::initialize({});

  // Input generation happens here, outside of the benchmark bodies, which only
  // call the fixture's run*() methods.
  bm = std::make_unique<SetAccumulatorBenchmark>();
  bm->setup();

  folly::runBenchmarks();

  // Release the fixture explicitly instead of leaving it to static
  // destruction.
  // NOTE(review): presumably so that the fixture's memory pool is destroyed
  // before the memory manager is torn down - confirm.
  bm.reset();

  return 0;
}
Oops, something went wrong.