From 3533f9430e375a5af52b280cf255615f9b5dd396 Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Mon, 25 Mar 2024 13:02:32 -0700 Subject: [PATCH] Change TypeWithId to take exclusive ownership on children to avoid confusion Summary: Because the constructor is reparenting the children, taking shared ownership on children is misleading, especially when they are `const` qualified. Differential Revision: D55318609 --- velox/dwio/common/TypeUtils.cpp | 34 ++++--- velox/dwio/common/TypeWithId.cpp | 24 +++-- velox/dwio/common/TypeWithId.h | 10 +- velox/dwio/common/tests/TypeTests.cpp | 2 +- velox/dwio/dwrf/test/ColumnWriterTest.cpp | 2 +- velox/dwio/dwrf/test/TestColumnReader.cpp | 2 +- velox/dwio/parquet/reader/ParquetReader.cpp | 97 +++++++++---------- .../dwio/parquet/reader/ParquetTypeWithId.cpp | 37 ++++++- velox/dwio/parquet/reader/ParquetTypeWithId.h | 4 +- 9 files changed, 134 insertions(+), 78 deletions(-) diff --git a/velox/dwio/common/TypeUtils.cpp b/velox/dwio/common/TypeUtils.cpp index 312b9a6e6eaf3..fb58d6a6d0729 100644 --- a/velox/dwio/common/TypeUtils.cpp +++ b/velox/dwio/common/TypeUtils.cpp @@ -39,49 +39,53 @@ void checkChildrenSelected( } } -std::shared_ptr visit( +std::unique_ptr visit( const std::shared_ptr& typeWithId, const std::function& selector) { if (typeWithId->type()->isPrimitiveType()) { - return typeWithId; + return std::make_unique( + typeWithId->type(), + std::vector>(), + typeWithId->id(), + typeWithId->maxId(), + typeWithId->column()); } if (typeWithId->type()->isRow()) { std::vector names; - std::vector> typesWithId; + std::vector> children; std::vector> types; auto& row = typeWithId->type()->asRow(); for (auto i = 0; i < typeWithId->size(); ++i) { auto& child = typeWithId->childAt(i); if (selector(child->id())) { names.push_back(row.nameOf(i)); - std::shared_ptr twid; - twid = visit(child, selector); - typesWithId.push_back(twid); - types.push_back(twid->type()); + auto newChild = visit(child, selector); + types.push_back(newChild->type()); + children.push_back(std::move(newChild)); } } VELOX_USER_CHECK( !types.empty(), "selected nothing from row: " + row.toString()); - return std::make_shared( + return std::make_unique( ROW(std::move(names), std::move(types)), - std::move(typesWithId), + std::move(children), typeWithId->id(), typeWithId->maxId(), typeWithId->column()); } else { checkChildrenSelected(typeWithId, selector); - std::vector> typesWithId; + std::vector> children; std::vector> types; for (auto i = 0; i < typeWithId->size(); ++i) { auto& child = typeWithId->childAt(i); - std::shared_ptr twid = visit(child, selector); - typesWithId.push_back(twid); - types.push_back(twid->type()); + auto newChild = visit(child, selector); + types.push_back(newChild->type()); + children.push_back(std::move(newChild)); } auto type = createType(typeWithId->type()->kind(), std::move(types)); - return std::make_shared( + return std::make_unique( type, - std::move(typesWithId), + std::move(children), typeWithId->id(), typeWithId->maxId(), typeWithId->column()); diff --git a/velox/dwio/common/TypeWithId.cpp b/velox/dwio/common/TypeWithId.cpp index 6436aa25f55fb..6b803a6249da4 100644 --- a/velox/dwio/common/TypeWithId.cpp +++ b/velox/dwio/common/TypeWithId.cpp @@ -22,9 +22,21 @@ namespace facebook::velox::dwio::common { using velox::Type; using velox::TypeKind; +namespace { +std::vector> toShared( + std::vector> nodes) { + std::vector> result; + result.reserve(nodes.size()); + for (auto&& node : nodes) { + result.emplace_back(std::move(node)); + } + return result; +} +} // namespace + TypeWithId::TypeWithId( std::shared_ptr type, - std::vector>&& children, + std::vector>&& children, uint32_t id, uint32_t maxId, uint32_t column) @@ -33,13 +45,13 @@ TypeWithId::TypeWithId( id_{id}, maxId_{maxId}, column_{column}, - children_{std::move(children)} { + children_{toShared(std::move(children))} { for (auto& child : children_) { const_cast(child->parent_) = this; } } -std::shared_ptr TypeWithId::create( +std::unique_ptr TypeWithId::create( const std::shared_ptr& root, uint32_t next) { return create(root, next, 0); @@ -54,13 +66,13 @@ const std::shared_ptr& TypeWithId::childAt( return children_.at(idx); } -std::shared_ptr TypeWithId::create( +std::unique_ptr TypeWithId::create( const std::shared_ptr& type, uint32_t& next, uint32_t column) { DWIO_ENSURE_NOT_NULL(type); const uint32_t myId = next++; - std::vector> children{}; + std::vector> children; children.reserve(type->size()); auto offset = 0; for (const auto& child : *type) { @@ -70,7 +82,7 @@ std::shared_ptr TypeWithId::create( (myId == 0 && type->kind() == TypeKind::ROW) ? offset++ : column)); } const uint32_t maxId = next - 1; - return std::make_shared( + return std::make_unique( type, std::move(children), myId, maxId, column); } diff --git a/velox/dwio/common/TypeWithId.h b/velox/dwio/common/TypeWithId.h index 953ac87b2b8c3..39a988f9936a9 100644 --- a/velox/dwio/common/TypeWithId.h +++ b/velox/dwio/common/TypeWithId.h @@ -24,14 +24,18 @@ namespace facebook::velox::dwio::common { class TypeWithId : public velox::Tree> { public: + /// NOTE: This constructor will re-parent the children. TypeWithId( std::shared_ptr type, - std::vector>&& children, + std::vector>&& children, uint32_t id, uint32_t maxId, uint32_t column); - static std::shared_ptr create( + TypeWithId(const TypeWithId&) = delete; + TypeWithId& operator=(const TypeWithId&) = delete; + + static std::unique_ptr create( const std::shared_ptr& root, uint32_t next = 0); @@ -70,7 +74,7 @@ class TypeWithId : public velox::Tree> { } private: - static std::shared_ptr create( + static std::unique_ptr create( const std::shared_ptr& type, uint32_t& next, uint32_t column); diff --git a/velox/dwio/common/tests/TypeTests.cpp b/velox/dwio/common/tests/TypeTests.cpp index f4ad3f635259f..ce7e4927ad7c5 100644 --- a/velox/dwio/common/tests/TypeTests.cpp +++ b/velox/dwio/common/tests/TypeTests.cpp @@ -39,7 +39,7 @@ TEST(TestType, selectedType) { "col3:map,col4:float," "col5:int,col6:bigint,col7:string>", HiveTypeSerializer::serialize(type).c_str()); - auto typeWithId = TypeWithId::create(type); + std::shared_ptr typeWithId = TypeWithId::create(type); EXPECT_EQ(0, typeWithId->id()); EXPECT_EQ(11, typeWithId->maxId()); diff --git a/velox/dwio/dwrf/test/ColumnWriterTest.cpp b/velox/dwio/dwrf/test/ColumnWriterTest.cpp index f7a2b1249d4fa..1918ff0453168 100644 --- a/velox/dwio/dwrf/test/ColumnWriterTest.cpp +++ b/velox/dwio/dwrf/test/ColumnWriterTest.cpp @@ -4239,7 +4239,7 @@ TEST_F(ColumnWriterTest, ColumnIdInStream) { const uint32_t kColumnId = 2; auto typeWithId = std::make_shared( type, - std::vector>{}, + std::vector>{}, /* id */ kNodeId, /* maxId */ kNodeId, /* column */ kColumnId); diff --git a/velox/dwio/dwrf/test/TestColumnReader.cpp b/velox/dwio/dwrf/test/TestColumnReader.cpp index 88655f49e515c..67194d51df5ff 100644 --- a/velox/dwio/dwrf/test/TestColumnReader.cpp +++ b/velox/dwio/dwrf/test/TestColumnReader.cpp @@ -133,7 +133,7 @@ class ColumnReaderTestBase { EXPECT_CALL(streams_, getRowReaderOptionsProxy()) .WillRepeatedly(testing::Return(&options)); - auto fileTypeWithId = + std::shared_ptr fileTypeWithId = TypeWithId::create(fileType ? fileType : requestedType); if (useSelectiveReader()) { diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 72f36943896f2..76b01b4bd33b8 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -90,7 +90,7 @@ class ReaderBase { void initializeSchema(); - std::shared_ptr getParquetColumnInfo( + std::unique_ptr getParquetColumnInfo( uint32_t maxSchemaElementIdx, uint32_t maxRepeat, uint32_t maxDefine, @@ -100,9 +100,9 @@ class ReaderBase { TypePtr convertType(const thrift::SchemaElement& schemaElement) const; + template static std::shared_ptr createRowType( - std::vector> - children, + const std::vector& children, bool fileColumnNamesReadAsLowerCase); memory::MemoryPool& pool_; @@ -222,7 +222,7 @@ void ReaderBase::initializeSchema() { schemaWithId_->getChildren(), isFileColumnNamesReadAsLowerCase()); } -std::shared_ptr ReaderBase::getParquetColumnInfo( +std::unique_ptr ReaderBase::getParquetColumnInfo( uint32_t maxSchemaElementIdx, uint32_t maxRepeat, uint32_t maxDefine, @@ -256,7 +256,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( schemaElement.__isset.num_children && schemaElement.num_children > 0, "Node has no children but should"); - std::vector> children; + std::vector> children; auto curSchemaIdx = schemaIdx; for (int32_t i = 0; i < schemaElement.num_children; i++) { @@ -267,7 +267,7 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( curSchemaIdx, ++schemaIdx, columnIdx); - children.push_back(child); + children.push_back(std::move(child)); } VELOX_CHECK(!children.empty()); @@ -284,11 +284,11 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( thrift::FieldRepetitionType::REPEATED); VELOX_CHECK_EQ(children.size(), 2); - auto childrenCopy = children; - return std::make_shared( - TypeFactory::create( - children[0]->type(), children[1]->type()), - std::move(childrenCopy), + auto type = TypeFactory::create( + children[0]->type(), children[1]->type()); + return std::make_unique( + std::move(type), + std::move(children), curSchemaIdx, // TODO: there are holes in the ids maxSchemaElementIdx, ParquetTypeWithId::kNonLeaf, // columnIdx, @@ -308,10 +308,10 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( case thrift::ConvertedType::MAP: { VELOX_CHECK_EQ(children.size(), 1); const auto& child = children[0]; - auto grandChildren = child->getChildren(); - return std::make_shared( - child->type(), - std::move(grandChildren), + auto type = child->type(); + return std::make_unique( + std::move(type), + std::move(*(ParquetTypeWithId*)child.get()).moveChildren(), curSchemaIdx, // TODO: there are holes in the ids maxSchemaElementIdx, ParquetTypeWithId::kNonLeaf, // columnIdx, @@ -335,10 +335,10 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( children.size(), 2, "children size should not be larger than 2"); if (children.size() == 1) { // child of LIST - auto childrenCopy = children; - return std::make_shared( - TypeFactory::create(children[0]->type()), - std::move(childrenCopy), + auto type = TypeFactory::create(children[0]->type()); + return std::make_unique( + std::move(type), + std::move(children), curSchemaIdx, maxSchemaElementIdx, ParquetTypeWithId::kNonLeaf, // columnIdx, @@ -349,11 +349,11 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( maxDefine); } else if (children.size() == 2) { // children of MAP - auto childrenCopy = children; - return std::make_shared( - TypeFactory::create( - children[0]->type(), children[1]->type()), - std::move(childrenCopy), + auto type = TypeFactory::create( + children[0]->type(), children[1]->type()); + return std::make_unique( + std::move(type), + std::move(children), curSchemaIdx, // TODO: there are holes in the ids maxSchemaElementIdx, ParquetTypeWithId::kNonLeaf, // columnIdx, @@ -365,10 +365,10 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( } } else { // Row type - auto childrenCopy = children; - return std::make_shared( - createRowType(children, isFileColumnNamesReadAsLowerCase()), - std::move(childrenCopy), + auto type = createRowType(children, isFileColumnNamesReadAsLowerCase()); + return std::make_unique( + std::move(type), + std::move(children), curSchemaIdx, maxSchemaElementIdx, ParquetTypeWithId::kNonLeaf, // columnIdx, @@ -386,34 +386,33 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( int32_t scale = schemaElement.__isset.scale ? schemaElement.scale : 0; int32_t type_length = schemaElement.__isset.type_length ? schemaElement.type_length : 0; - std::vector> children; + std::vector> children; const std::optional logicalType_ = schemaElement.__isset.logicalType ? std::optional(schemaElement.logicalType) : std::nullopt; - std::shared_ptr leafTypePtr = - std::make_shared( - veloxType, - std::move(children), - curSchemaIdx, - maxSchemaElementIdx, - columnIdx++, - name, - schemaElement.type, - logicalType_, - maxRepeat, - maxDefine, - precision, - scale, - type_length); + auto leafTypePtr = std::make_unique( + veloxType, + std::move(children), + curSchemaIdx, + maxSchemaElementIdx, + columnIdx++, + name, + schemaElement.type, + logicalType_, + maxRepeat, + maxDefine, + precision, + scale, + type_length); if (schemaElement.repetition_type == thrift::FieldRepetitionType::REPEATED) { // Array children.clear(); children.reserve(1); - children.push_back(leafTypePtr); - return std::make_shared( + children.push_back(std::move(leafTypePtr)); + return std::make_unique( TypeFactory::create(veloxType), std::move(children), curSchemaIdx, @@ -571,14 +570,14 @@ TypePtr ReaderBase::convertType( } } +template std::shared_ptr ReaderBase::createRowType( - std::vector> children, + const std::vector& children, bool fileColumnNamesReadAsLowerCase) { std::vector childNames; std::vector childTypes; for (auto& child : children) { - auto childName = - std::static_pointer_cast(child)->name_; + auto childName = static_cast(*child).name_; if (fileColumnNamesReadAsLowerCase) { folly::toLowerAscii(childName); } diff --git a/velox/dwio/parquet/reader/ParquetTypeWithId.cpp b/velox/dwio/parquet/reader/ParquetTypeWithId.cpp index 2846391f6f779..fc9783537aa33 100644 --- a/velox/dwio/parquet/reader/ParquetTypeWithId.cpp +++ b/velox/dwio/parquet/reader/ParquetTypeWithId.cpp @@ -34,6 +34,41 @@ bool containsList(const ParquetTypeWithId& type) { } // namespace using arrow::LevelInfo; +std::vector> +ParquetTypeWithId::moveChildren() && { + std::vector> children; + for (auto& child : getChildren()) { + auto type = child->type(); + auto id = child->id(); + auto maxId = child->maxId(); + auto column = child->column(); + auto* parquetChild = (ParquetTypeWithId*)child.get(); + auto name = parquetChild->name_; + auto parquetType = parquetChild->parquetType_; + auto logicalType = parquetChild->logicalType_; + auto maxRepeat = parquetChild->maxRepeat_; + auto maxDefine = parquetChild->maxDefine_; + auto precision = parquetChild->precision_; + auto scale = parquetChild->scale_; + auto typeLength = parquetChild->typeLength_; + children.push_back(std::make_unique( + std::move(type), + std::move(*parquetChild).moveChildren(), + id, + maxId, + column, + std::move(name), + parquetType, + logicalType, + maxRepeat, + maxDefine, + precision, + scale, + typeLength)); + } + return children; +} + bool ParquetTypeWithId::hasNonRepeatedLeaf() const { if (type()->kind() == TypeKind::ARRAY) { return false; @@ -67,7 +102,7 @@ LevelMode ParquetTypeWithId::makeLevelInfo(LevelInfo& info) const { if (isStruct) { bool isAllLists = true; for (auto i = 0; i < getChildren().size(); ++i) { - auto child = parquetChildAt(i); + auto& child = parquetChildAt(i); if (child.type()->kind() != TypeKind ::ARRAY) { isAllLists = false; } diff --git a/velox/dwio/parquet/reader/ParquetTypeWithId.h b/velox/dwio/parquet/reader/ParquetTypeWithId.h index 4fc6b347f2213..e9b41af6abbc1 100644 --- a/velox/dwio/parquet/reader/ParquetTypeWithId.h +++ b/velox/dwio/parquet/reader/ParquetTypeWithId.h @@ -36,7 +36,7 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { ParquetTypeWithId( TypePtr type, - std::vector>&& children, + std::vector>&& children, uint32_t id, uint32_t maxId, uint32_t column, @@ -74,6 +74,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { /// Fills 'info' and returns the mode for interpreting levels. LevelMode makeLevelInfo(arrow::LevelInfo& info) const; + std::vector> moveChildren() &&; + const std::string name_; const std::optional parquetType_; const std::optional logicalType_;