From 2e541f710e37af16c498e9ad6a3f9dfda8c5a425 Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Mon, 3 Jun 2024 08:57:13 -0700 Subject: [PATCH] Optimize readWithVisitor for TrivialEncoding and MainlyConstantEncoding Summary: - Fast path for `TrivialEncoding::readWithVisitor` - Fast path for `MainlyConstantEncoding::readWithVisitor` - Store `encodingType`, `dataType`, `rowCount` in `Encoding` object memory to reduce memory fetch on `data_` - Use skip functor only in `readWithVisitorSlow` to avoid virtual call cost Differential Revision: D58085138 --- velox/dwio/common/SelectiveColumnReader.h | 13 ++++++----- .../common/SelectiveColumnReaderInternal.h | 1 - .../dwio/common/tests/utils/DataSetBuilder.h | 22 +++++++++++++++++++ .../common/tests/utils/E2EFilterTestBase.h | 5 +++++ 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/velox/dwio/common/SelectiveColumnReader.h b/velox/dwio/common/SelectiveColumnReader.h index 301deac03dfba..ab0797e6aecc3 100644 --- a/velox/dwio/common/SelectiveColumnReader.h +++ b/velox/dwio/common/SelectiveColumnReader.h @@ -209,9 +209,13 @@ class SelectiveColumnReader { // Returns a pointer to output rows with at least 'size' elements available. vector_size_t* mutableOutputRows(int32_t size) { - numOutConfirmed_ = outputRows_.size(); - outputRows_.resize(numOutConfirmed_ + size); - return outputRows_.data() + numOutConfirmed_; + auto numOutConfirmed = outputRows_.size(); + outputRows_.resize(numOutConfirmed + size); + return outputRows_.data() + numOutConfirmed; + } + + void* rawValues() { + return rawValues_; } template <typename T> @@ -616,9 +620,6 @@ class SelectiveColumnReader { // Rows passing the filter in readWithVisitor. Must stay // constant between consecutive calls to read(). raw_vector<vector_size_t> outputRows_; - // Index of last set value in outputRows. Values between this and - // size() can be used as scratchpad inside read().
- vector_size_t numOutConfirmed_; // The row number // corresponding to each element in 'values_' raw_vector<vector_size_t> valueRows_; diff --git a/velox/dwio/common/SelectiveColumnReaderInternal.h b/velox/dwio/common/SelectiveColumnReaderInternal.h index b6cf9ebe59e6a..01819fa41c31f 100644 --- a/velox/dwio/common/SelectiveColumnReaderInternal.h +++ b/velox/dwio/common/SelectiveColumnReaderInternal.h @@ -95,7 +95,6 @@ void SelectiveColumnReader::prepareRead( outputRows_.clear(); // is part of read() and after read returns getValues may be called. mayGetValues_ = true; - numOutConfirmed_ = 0; numValues_ = 0; valueSize_ = sizeof(T); inputRows_ = rows; diff --git a/velox/dwio/common/tests/utils/DataSetBuilder.h b/velox/dwio/common/tests/utils/DataSetBuilder.h index 2bc7ccc31494e..d43fe28518a8d 100644 --- a/velox/dwio/common/tests/utils/DataSetBuilder.h +++ b/velox/dwio/common/tests/utils/DataSetBuilder.h @@ -141,6 +141,28 @@ class DataSetBuilder { return *this; } + template <typename T> + DataSetBuilder& withIntMainlyConstantForField(const common::Subfield& field) { + for (auto& batch : *batches_) { + std::optional<T> value; + auto* numbers = dwio::common::getChildBySubfield(batch.get(), field) + ->as<FlatVector<T>>(); + for (auto row = 0; row < numbers->size(); ++row) { + if (numbers->isNullAt(row)) { + continue; + } + if (folly::Random::randDouble01(rng_) < 0.95) { + if (!value.has_value()) { + value = numbers->valueAt(row); + } else { + numbers->set(row, *value); + } + } + } + } + return *this; + } + template <typename T> DataSetBuilder& withQuantizedFloatForField( const common::Subfield& field, diff --git a/velox/dwio/common/tests/utils/E2EFilterTestBase.h b/velox/dwio/common/tests/utils/E2EFilterTestBase.h index 0dfe30d239490..b6fe73677909d 100644 --- a/velox/dwio/common/tests/utils/E2EFilterTestBase.h +++ b/velox/dwio/common/tests/utils/E2EFilterTestBase.h @@ -144,6 +144,11 @@ class E2EFilterTestBase : public testing::Test { dataSetBuilder_->withIntRleForField<T>(Subfield(fieldName)); } + template <typename T> + void
makeIntMainlyConstant(const std::string& fieldName) { + dataSetBuilder_->withIntMainlyConstantForField<T>(Subfield(fieldName)); + } + template <typename T> void makeQuantizedFloat( const std::string& fieldName,