diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index c4d6966c9c95..3e010d831967 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -235,6 +235,8 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( auto& schema = fileMetaData_->schema; uint32_t curSchemaIdx = schemaIdx; auto& schemaElement = schema[curSchemaIdx]; + bool isRepeated = false; + bool isOptional = false; if (schemaElement.__isset.repetition_type) { if (schemaElement.repetition_type != @@ -244,6 +246,11 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( if (schemaElement.repetition_type == thrift::FieldRepetitionType::REPEATED) { maxRepeat++; + isRepeated = true; + } + if (schemaElement.repetition_type == + thrift::FieldRepetitionType::OPTIONAL) { + isOptional = true; } } @@ -300,7 +307,9 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat, - maxDefine); + maxDefine, + isOptional, + isRepeated); } // For backward-compatibility, a group annotated with MAP_KEY_VALUE @@ -313,6 +322,12 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( VELOX_CHECK_EQ(children.size(), 1); const auto& child = children[0]; auto type = child->type(); + isRepeated = true; + // This level will not have the "isRepeated" info in the parquet + // schema since parquet schema will have a child layer which will have + // the "repeated info" which we are ignoring here, hence we set the + // isRepeated to true eg + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists return std::make_unique( std::move(type), std::move(*(ParquetTypeWithId*)child.get()).moveChildren(), @@ -323,7 +338,9 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat + 1, - maxDefine); + maxDefine, + isOptional, + isRepeated); } default: @@ -354,7 +371,9 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat, - maxDefine); + maxDefine, + isOptional, + isRepeated); } else if ( schema[parentSchemaIdx].converted_type == thrift::ConvertedType::MAP || @@ -374,7 +393,9 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat, - maxDefine); + maxDefine, + isOptional, + isRepeated); } } else { // Row type @@ -389,7 +410,9 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat, - maxDefine); + maxDefine, + isOptional, + isRepeated); } } } else { // leaf node @@ -415,6 +438,8 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( logicalType_, maxRepeat, maxDefine, + isOptional, + isRepeated, precision, scale, type_length); @@ -430,12 +455,14 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( std::move(children), curSchemaIdx, maxSchemaElementIdx, - columnIdx++, + columnIdx - 1, // was already incremented for leafTypePtr std::move(name), std::nullopt, std::nullopt, maxRepeat, - maxDefine - 1); + maxDefine - 1, + isOptional, + isRepeated); } return leafTypePtr; } @@ -631,6 +658,9 @@ int64_t ReaderBase::rowGroupUncompressedSize( int32_t rowGroupIndex, const dwio::common::TypeWithId& type) const { if (type.column() != ParquetTypeWithId::kNonLeaf) { + VELOX_CHECK_LT(rowGroupIndex, fileMetaData_->row_groups.size()); + VELOX_CHECK_LT( + type.column(), fileMetaData_->row_groups[rowGroupIndex].columns.size()); return fileMetaData_->row_groups[rowGroupIndex] .columns[type.column()] .meta_data.total_uncompressed_size; diff --git a/velox/dwio/parquet/reader/ParquetTypeWithId.cpp b/velox/dwio/parquet/reader/ParquetTypeWithId.cpp index 32f415837ea2..97891920ff13 100644 --- a/velox/dwio/parquet/reader/ParquetTypeWithId.cpp +++ b/velox/dwio/parquet/reader/ParquetTypeWithId.cpp @@ -48,6 +48,8 @@ ParquetTypeWithId::moveChildren() && { auto logicalType = parquetChild->logicalType_; auto maxRepeat = parquetChild->maxRepeat_; auto maxDefine = parquetChild->maxDefine_; + auto isOptional = parquetChild->isOptional_; + auto isRepeated = parquetChild->isRepeated_; auto precision = parquetChild->precision_; auto scale = parquetChild->scale_; auto typeLength = parquetChild->typeLength_; @@ -62,6 +64,8 @@ ParquetTypeWithId::moveChildren() && { std::move(logicalType), maxRepeat, maxDefine, + isOptional, + isRepeated, precision, scale, typeLength)); @@ -86,15 +90,14 @@ bool ParquetTypeWithId::hasNonRepeatedLeaf() const { } LevelMode ParquetTypeWithId::makeLevelInfo(LevelInfo& info) const { - int16_t repeatedAncestor = 0; - for (auto parent = parquetParent(); parent; - parent = parent->parquetParent()) { - if (parent->type()->kind() == TypeKind::ARRAY || - parent->type()->kind() == TypeKind::MAP) { - repeatedAncestor = parent->maxDefine_; - break; + int repeatedAncestor = maxDefine_; + auto node = this; + do { + if (node->isOptional_) { + repeatedAncestor--; } - } + node = node->parquetParent(); + } while (node && !node->isRepeated_); bool isList = type()->kind() == TypeKind::ARRAY; bool isStruct = type()->kind() == TypeKind::ROW; bool isMap = type()->kind() == TypeKind::MAP; diff --git a/velox/dwio/parquet/reader/ParquetTypeWithId.h b/velox/dwio/parquet/reader/ParquetTypeWithId.h index 8959633fdbcb..e593c23be6ac 100644 --- a/velox/dwio/parquet/reader/ParquetTypeWithId.h +++ b/velox/dwio/parquet/reader/ParquetTypeWithId.h @@ -45,6 +45,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { std::optional logicalType, uint32_t maxRepeat, uint32_t maxDefine, + bool isOptional, + bool isRepeated, int32_t precision = 0, int32_t scale = 0, int32_t typeLength = 0) @@ -54,6 +56,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { logicalType_(std::move(logicalType)), maxRepeat_(maxRepeat), maxDefine_(maxDefine), + isOptional_(isOptional), + isRepeated_(isRepeated), precision_(precision), scale_(scale), typeLength_(typeLength) {} @@ -81,6 +85,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { const std::optional logicalType_; const uint32_t maxRepeat_; const uint32_t maxDefine_; + const bool isOptional_; + const bool isRepeated_; const int32_t precision_; const int32_t scale_; const int32_t typeLength_; diff --git a/velox/dwio/parquet/tests/examples/array_of_array1.parquet b/velox/dwio/parquet/tests/examples/array_of_array1.parquet new file mode 100644 index 000000000000..4c6e9d274167 Binary files /dev/null and b/velox/dwio/parquet/tests/examples/array_of_array1.parquet differ diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 1ec63e3e793a..7b2d1e24c4d7 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -111,6 +111,19 @@ class ParquetTableScanTest : public HiveConnectorTestBase { createDuckDbTable({data}); } + void loadDataWithRowType(const std::string& filePath, RowVectorPtr data) { + splits_ = {makeSplit(filePath)}; + auto pool = facebook::velox::memory::memoryManager()->addLeafPool(); + dwio::common::ReaderOptions readerOpts{pool.get()}; + auto reader = std::make_unique( + std::make_unique( + std::make_shared(filePath), + readerOpts.getMemoryPool()), + readerOpts); + rowType_ = reader->rowType(); + createDuckDbTable({data}); + } + std::string getExampleFilePath(const std::string& fileName) { return facebook::velox::test::getDataFilePath( "velox/dwio/parquet/tests/reader", "../examples/" + fileName); @@ -303,9 +316,8 @@ TEST_F(ParquetTableScanTest, singleRowStruct) { } // Core dump and incorrect result are fixed. -TEST_F(ParquetTableScanTest, DISABLED_array) { - auto vector = makeArrayVector({{1, 2, 3}}); - +TEST_F(ParquetTableScanTest, array) { + auto vector = makeArrayVector({}); loadData( getExampleFilePath("old_repeated_int.parquet"), ROW({"repeatedInt"}, {ARRAY(INTEGER())}), @@ -316,12 +328,11 @@ TEST_F(ParquetTableScanTest, DISABLED_array) { })); assertSelectWithFilter( - {"repeatedInt"}, {}, "", "SELECT repeatedInt FROM tmp"); + {"repeatedInt"}, {}, "", "SELECT UNNEST(array[array[1,2,3]])"); } // Optional array with required elements. -// Incorrect result. -TEST_F(ParquetTableScanTest, DISABLED_optArrayReqEle) { +TEST_F(ParquetTableScanTest, optArrayReqEle) { auto vector = makeArrayVector({}); loadData( @@ -341,8 +352,7 @@ TEST_F(ParquetTableScanTest, DISABLED_optArrayReqEle) { } // Required array with required elements. -// Core dump is fixed, but the result is incorrect. -TEST_F(ParquetTableScanTest, DISABLED_reqArrayReqEle) { +TEST_F(ParquetTableScanTest, reqArrayReqEle) { auto vector = makeArrayVector({}); loadData( @@ -362,8 +372,7 @@ TEST_F(ParquetTableScanTest, DISABLED_reqArrayReqEle) { } // Required array with optional elements. -// Incorrect result. -TEST_F(ParquetTableScanTest, DISABLED_reqArrayOptEle) { +TEST_F(ParquetTableScanTest, reqArrayOptEle) { auto vector = makeArrayVector({}); loadData( @@ -382,22 +391,39 @@ TEST_F(ParquetTableScanTest, DISABLED_reqArrayOptEle) { "SELECT UNNEST(array[array['a', null], array[], array[null, 'b']])"); } +TEST_F(ParquetTableScanTest, arrayOfArrayTest) { + auto vector = makeArrayVector({}); + + loadDataWithRowType( + getExampleFilePath("array_of_array1.parquet"), + makeRowVector( + {"_1"}, + { + vector, + })); + + assertSelectWithFilter( + {"_1"}, + {}, + "", + "SELECT UNNEST(array[null, array[array['g', 'h'], null]])"); +} + // Required array with legacy format. -// Incorrect result. -TEST_F(ParquetTableScanTest, DISABLED_reqArrayLegacy) { +TEST_F(ParquetTableScanTest, reqArrayLegacy) { auto vector = makeArrayVector({}); loadData( getExampleFilePath("array_3.parquet"), - ROW({"_1"}, {ARRAY(VARCHAR())}), + ROW({"element"}, {ARRAY(VARCHAR())}), makeRowVector( - {"_1"}, + {"element"}, { vector, })); assertSelectWithFilter( - {"_1"}, + {"element"}, {}, "", "SELECT UNNEST(array[array['a', 'b'], array[], array['c', 'd']])");