From afd6753a6e42038d6df8488bb70c7688a7030f06 Mon Sep 17 00:00:00 2001 From: "Huameng (Michael) Jiang" Date: Tue, 23 Jul 2024 12:29:44 -0700 Subject: [PATCH] Selective array and map column reader (#10448) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10448 Implement selective array and map column readers. This is another type of top-level column without independent null streams, and hence requires some new functionality for loading nullable encodings. There is another nuance in this diff: the selective reader currently always loads the nulls first and then the values, and passes the combined nulls into the readLengths methods instead of just the top-level incoming nulls for scattering. There are three more ideal options: 1) a materializeNonNull API for encodings; 2) a materializeNullable API for encodings for use with combined nulls; 3) a way for the selective reader to avoid materializing combined nulls without compromising efficiency. For now, we have added a hack in NimbleData that loads the values along with the nulls for nullable encodings and returns the cached values when readLengths is called later. To fit this access pattern, we also override the skip methods.
Reviewed By: Yuhta Differential Revision: D58937281 fbshipit-source-id: e821d8bc8b6637df2d2c79b05661d94f209c3fa2 --- .../dwio/common/tests/utils/DataSetBuilder.cpp | 17 ++++++++++++++--- velox/dwio/common/tests/utils/DataSetBuilder.h | 3 ++- .../common/tests/utils/E2EFilterTestBase.cpp | 13 ++++++++----- .../dwio/common/tests/utils/E2EFilterTestBase.h | 6 ++++-- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/velox/dwio/common/tests/utils/DataSetBuilder.cpp b/velox/dwio/common/tests/utils/DataSetBuilder.cpp index f9d532c84eab..07b6b245003f 100644 --- a/velox/dwio/common/tests/utils/DataSetBuilder.cpp +++ b/velox/dwio/common/tests/utils/DataSetBuilder.cpp @@ -37,7 +37,8 @@ RowTypePtr DataSetBuilder::makeRowType( DataSetBuilder& DataSetBuilder::makeDataset( RowTypePtr rowType, const size_t batchCount, - const size_t numRows) { + const size_t numRows, + const bool withRecursiveNulls) { if (batches_) { batches_->clear(); } else { @@ -45,8 +46,18 @@ DataSetBuilder& DataSetBuilder::makeDataset( } for (size_t i = 0; i < batchCount; ++i) { - batches_->push_back(std::static_pointer_cast( - BatchMaker::createBatch(rowType, numRows, pool_, nullptr, i))); + if (withRecursiveNulls) { + batches_->push_back(std::static_pointer_cast( + BatchMaker::createBatch(rowType, numRows, pool_, nullptr, i))); + } else { + batches_->push_back( + std::static_pointer_cast(BatchMaker::createBatch( + rowType, + numRows, + pool_, + [](vector_size_t /*index*/) { return false; }, + i))); + } } return *this; diff --git a/velox/dwio/common/tests/utils/DataSetBuilder.h b/velox/dwio/common/tests/utils/DataSetBuilder.h index d43fe28518a8..4893c28336f6 100644 --- a/velox/dwio/common/tests/utils/DataSetBuilder.h +++ b/velox/dwio/common/tests/utils/DataSetBuilder.h @@ -43,7 +43,8 @@ class DataSetBuilder { DataSetBuilder& makeDataset( RowTypePtr rowType, const size_t batchCount, - const size_t numRows); + const size_t numRows, + const bool withRecursiveNulls = true); // Adds high values to 
'batches_' so that these values occur only in some row // groups. Tests skipping row groups based on row group stats. diff --git a/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp b/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp index 755ff7157390..287d245c8408 100644 --- a/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp +++ b/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp @@ -46,12 +46,14 @@ using velox::common::Subfield; std::vector E2EFilterTestBase::makeDataset( std::function customize, - bool forRowGroupSkip) { + bool forRowGroupSkip, + bool withRecursiveNulls) { if (!dataSetBuilder_) { dataSetBuilder_ = std::make_unique(*leafPool_, 0); } - dataSetBuilder_->makeDataset(rowType_, batchCount_, batchSize_); + dataSetBuilder_->makeDataset( + rowType_, batchCount_, batchSize_, withRecursiveNulls); if (forRowGroupSkip) { dataSetBuilder_->withRowGroupSpecificData(kRowsInGroup); @@ -408,17 +410,18 @@ void E2EFilterTestBase::testScenario( std::function customize, bool wrapInStruct, const std::vector& filterable, - int32_t numCombinations) { + int32_t numCombinations, + bool withRecursiveNulls) { rowType_ = DataSetBuilder::makeRowType(columns, wrapInStruct); filterGenerator_ = std::make_unique(rowType_, seed_); - auto batches = makeDataset(customize, false); + auto batches = makeDataset(customize, false, withRecursiveNulls); writeToMemory(rowType_, batches, false); testNoRowGroupSkip(batches, filterable, numCombinations); testPruningWithFilter(batches, filterable); if (testRowGroupSkip_) { - batches = makeDataset(customize, true); + batches = makeDataset(customize, true, withRecursiveNulls); writeToMemory(rowType_, batches, true); testRowGroupSkip(batches, filterable); } diff --git a/velox/dwio/common/tests/utils/E2EFilterTestBase.h b/velox/dwio/common/tests/utils/E2EFilterTestBase.h index f26ac8beef1b..f0e9d1daa0c9 100644 --- a/velox/dwio/common/tests/utils/E2EFilterTestBase.h +++ b/velox/dwio/common/tests/utils/E2EFilterTestBase.h @@ -105,7 +105,8 @@ 
class E2EFilterTestBase : public testing::Test { std::vector makeDataset( std::function customize, - bool forRowGroupSkip); + bool forRowGroupSkip, + bool withRecursiveNulls); void makeAllNulls(const std::string& fieldName); @@ -297,7 +298,8 @@ class E2EFilterTestBase : public testing::Test { std::function customize, bool wrapInStruct, const std::vector& filterable, - int32_t numCombinations); + int32_t numCombinations, + bool withRecursiveNulls = true); private: void testMetadataFilterImpl(