From b0c2f44d379dc3648b368fba254c3997c13d7bf3 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Fri, 7 Jun 2024 10:09:01 -0700 Subject: [PATCH] Clean up some spurious shared_ptr creations/deletions in DwrfReader (#10095) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10095 I was looking at a query reading from a table with many columns, and it's spending a huge amount of CPU time just creating and deleting shared_ptrs when setting up the SplitReader and the RowReader. There's still more that can be done but this change cleans up some easy ones. Reviewed By: xiaoxmeng, Yuhta Differential Revision: D58263307 fbshipit-source-id: 3cbedfa1c47d97ae63029bceb400b7b7dc72a47d --- velox/dwio/common/ColumnSelector.cpp | 2 +- velox/dwio/common/FilterNode.h | 4 ++-- velox/dwio/dwrf/reader/DwrfReader.cpp | 26 +++++++++++++++----------- velox/dwio/dwrf/reader/ReaderBase.cpp | 13 +++++++++++-- velox/dwio/dwrf/reader/ReaderBase.h | 4 ++-- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/velox/dwio/common/ColumnSelector.cpp b/velox/dwio/common/ColumnSelector.cpp index 4d696c99ff2a..5074ca55afd9 100644 --- a/velox/dwio/common/ColumnSelector.cpp +++ b/velox/dwio/common/ColumnSelector.cpp @@ -91,7 +91,7 @@ FilterTypePtr ColumnSelector::buildNode( // column selector filter tree nodes_.reserve(nodes_.size() + type->size()); if (node.node == 0) { - auto rowType = type->asRow(); + auto& rowType = type->asRow(); for (size_t i = 0, size = type->size(); i < size; ++i) { bool inData = contentType && i < contentType->size(); current->addChild(buildNode( diff --git a/velox/dwio/common/FilterNode.h b/velox/dwio/common/FilterNode.h index eb268dd9ab33..894fa3550dbc 100644 --- a/velox/dwio/common/FilterNode.h +++ b/velox/dwio/common/FilterNode.h @@ -267,8 +267,8 @@ class FilterType { return node_.node == 0; } - inline void addChild(const FilterTypePtr& child) { - children_.push_back(child); + inline void addChild(FilterTypePtr child) { + children_.push_back(std::move(child)); } inline void setSequenceFilter(const SeqFilter& seqFilter) { diff --git a/velox/dwio/dwrf/reader/DwrfReader.cpp b/velox/dwio/dwrf/reader/DwrfReader.cpp index 25029ce2a284..e505be033421 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.cpp +++ b/velox/dwio/dwrf/reader/DwrfReader.cpp @@ -791,21 +791,29 @@ TypePtr updateColumnNames(const TypePtr& fileType, const TypePtr& tableType) { auto fileRowType = std::dynamic_pointer_cast(fileType); auto tableRowType = std::dynamic_pointer_cast(tableType); - std::vector newFileFieldNames{fileRowType->names()}; - std::vector newFileFieldTypes{fileRowType->children()}; + std::vector newFileFieldNames; + newFileFieldNames.reserve(fileRowType->size()); + std::vector newFileFieldTypes; + newFileFieldTypes.reserve(fileRowType->size()); for (auto childIdx = 0; childIdx < tableRowType->size(); ++childIdx) { if (childIdx >= fileRowType->size()) { break; } - newFileFieldTypes[childIdx] = updateColumnNames( + newFileFieldTypes.push_back(updateColumnNames( fileRowType->childAt(childIdx), tableRowType->childAt(childIdx), fileRowType->nameOf(childIdx), - tableRowType->nameOf(childIdx)); + tableRowType->nameOf(childIdx))); - newFileFieldNames[childIdx] = tableRowType->nameOf(childIdx); + newFileFieldNames.push_back(tableRowType->nameOf(childIdx)); + } + + for (auto childIdx = tableRowType->size(); childIdx < fileRowType->size(); + ++childIdx) { + newFileFieldTypes.push_back(fileRowType->childAt(childIdx)); + newFileFieldNames.push_back(fileRowType->nameOf(childIdx)); } return std::make_shared( @@ -830,9 +838,6 @@ TypePtr updateColumnNames( return fileType; } - std::vector fileFieldNames{fileType->size()}; - std::vector fileFieldTypes{fileType->size()}; - if (fileType->isRow()) { return updateColumnNames(fileType, tableType); } @@ -855,9 +860,8 @@ TypePtr updateColumnNames( void DwrfReader::updateColumnNamesFromTableSchema() { const auto& tableSchema = options_.fileSchema(); const auto& fileSchema = readerBase_->getSchema(); - auto newSchema = std::dynamic_pointer_cast( - updateColumnNames(fileSchema, tableSchema, "", "")); - readerBase_->setSchema(newSchema); + readerBase_->setSchema(std::dynamic_pointer_cast( + updateColumnNames(fileSchema, tableSchema, "", ""))); } std::unique_ptr DwrfReader::getStripe( diff --git a/velox/dwio/dwrf/reader/ReaderBase.cpp b/velox/dwio/dwrf/reader/ReaderBase.cpp index 734468f8515e..085ce2c501f5 100644 --- a/velox/dwio/dwrf/reader/ReaderBase.cpp +++ b/velox/dwio/dwrf/reader/ReaderBase.cpp @@ -289,23 +289,32 @@ std::shared_ptr ReaderBase::convertType( const auto type = footer.types(index); switch (type.kind()) { case TypeKind::BOOLEAN: + return BOOLEAN(); case TypeKind::TINYINT: + return TINYINT(); case TypeKind::SMALLINT: + return SMALLINT(); case TypeKind::INTEGER: + return INTEGER(); case TypeKind::BIGINT: + return BIGINT(); case TypeKind::HUGEINT: if (type.format() == DwrfFormat::kOrc && type.getOrcPtr()->kind() == proto::orc::Type_Kind_DECIMAL) { return DECIMAL( type.getOrcPtr()->precision(), type.getOrcPtr()->scale()); } - [[fallthrough]]; + return HUGEINT(); case TypeKind::REAL: + return REAL(); case TypeKind::DOUBLE: + return DOUBLE(); case TypeKind::VARCHAR: + return VARCHAR(); case TypeKind::VARBINARY: + return VARBINARY(); case TypeKind::TIMESTAMP: - return createScalarType(type.kind()); + return TIMESTAMP(); case TypeKind::ARRAY: return ARRAY(convertType( footer, type.subtypes(0), fileColumnNamesReadAsLowerCase)); diff --git a/velox/dwio/dwrf/reader/ReaderBase.h b/velox/dwio/dwrf/reader/ReaderBase.h index b4d91f5138f7..2805285a74c8 100644 --- a/velox/dwio/dwrf/reader/ReaderBase.h +++ b/velox/dwio/dwrf/reader/ReaderBase.h @@ -124,8 +124,8 @@ class ReaderBase { return schema_; } - void setSchema(const RowTypePtr& newSchema) { - schema_ = newSchema; + void setSchema(RowTypePtr newSchema) { + schema_ = std::move(newSchema); } const std::shared_ptr& getSchemaWithId()