From 91fef9173b0101ae999b6695a34d32ae87727db4 Mon Sep 17 00:00:00 2001 From: yan ma Date: Thu, 15 Aug 2024 00:14:34 +0800 Subject: [PATCH] fix parsing array struct when repetition_type is 'REPEATED' --- velox/dwio/parquet/reader/ParquetReader.cpp | 37 +++++++++++++ .../examples/proto-struct-with-array.parquet | Bin 0 -> 1576 bytes .../tests/reader/ParquetReaderTest.cpp | 50 ++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 velox/dwio/parquet/tests/examples/proto-struct-with-array.parquet diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 635366f19f136..a414d19f18fad 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -464,6 +464,43 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( maxDefine, isOptional, isRepeated); + } else { + // Row type + // To support list backward compatibility, we need to create a new row type + // instance and set all the fields as its children. + auto childrenRowType = + createRowType(children, isFileColumnNamesReadAsLowerCase()); + std::vector> + rowChildren; + // In this legacy case, there is no middle layer between "array" + // node and the children nodes. The code below creates this dummy middle + // layer to mimic the non-legacy case and fill the gap. 
+ rowChildren.emplace_back(std::make_unique( + childrenRowType, + std::move(children), + curSchemaIdx, + maxSchemaElementIdx, + ParquetTypeWithId::kNonLeaf, + "dummy", + std::nullopt, + std::nullopt, + maxRepeat, + maxDefine, + isOptional, + isRepeated)); + return std::make_unique( + TypeFactory::create(childrenRowType), + std::move(rowChildren), + curSchemaIdx, + maxSchemaElementIdx, + ParquetTypeWithId::kNonLeaf, // columnIdx, + std::move(name), + std::nullopt, + std::nullopt, + maxRepeat, + maxDefine, + isOptional, + isRepeated); } } else { // Row type diff --git a/velox/dwio/parquet/tests/examples/proto-struct-with-array.parquet b/velox/dwio/parquet/tests/examples/proto-struct-with-array.parquet new file mode 100644 index 0000000000000000000000000000000000000000..325a8370ad20ec31010fde0c816895da9f5dbd27 GIT binary patch literal 1576 zcmcIlO>5gg5S?rsql%Icnq5|qgD{N<#x?oCY2!n{ZAEKv64iDOJsH{F!~)4uB-v0( z@BJM;_owufA5^-l4@Z{ekVAVh>zOxi-n`LDMyq>_0q^0x8b74l?^SjJrp%q&06h8-y4iMdSJ@MDH4c~HjX3j($=&sN1 zW|q&!OYxG3d&~^8@dlzhDa$1b0`rz(6tkBD*J153G=T1;gzF$B0g1WSKnPOy6M$~KQ|W5 z5A#DO!$$Zca>TK`;67WBib&?m76{4rqTmNgwH)UCc)*uPhjcg;fc!=Tfl{N?GyS_6 z3+tZPeSOS=k#BjS>(f75Q`2EhwX*hMsK_@Kv&ZT;SydBky3mDR6_J}cL*_TtV}7>H zA+wumr}b9v46coS`}(TY;qmaR$9wg^82X@n)jvIvzps*~J`|Fl + // optionalMessage:struct + // requiredMessage:struct + // repeatedMessage:array> + const std::string sample( + getExampleFilePath("proto-struct-with-array.parquet")); + + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto reader = createReader(sample, readerOptions); + EXPECT_EQ(reader->numberOfRows(), 1ULL); + auto type = reader->typeWithId(); + EXPECT_EQ(type->size(), 6ULL); + auto col6_type = type->childAt(5); + EXPECT_EQ(col6_type->type()->kind(), TypeKind::ARRAY); + auto col6_1_type = col6_type->childAt(0); + EXPECT_EQ(col6_1_type->type()->kind(), TypeKind::ROW); + + auto outputRowType = + ROW({"optionalPrimitive", + "requiredPrimitive", + "repeatedPrimitive", + "optionalMessage", + 
"requiredMessage", + "repeatedMessage"}, + {INTEGER(), + INTEGER(), + ARRAY(INTEGER()), + ROW({"someId"}, {INTEGER()}), + ROW({"someId"}, {INTEGER()}), + ARRAY(ROW({"someId"}, {INTEGER()}))}); + auto rowReaderOpts = getReaderOpts(outputRowType); + rowReaderOpts.setScanSpec(makeScanSpec(outputRowType)); + auto rowReader = reader->createRowReader(rowReaderOpts); + VectorPtr result = BaseVector::create(outputRowType, 0, &*leafPool_); + + ASSERT_TRUE(rowReader->next(1, result)); + // data: 10, 9, , null, {9}, 2 elements starting at 0 {{9}, {10}}} + auto structArray = result->as()->childAt(5)->as(); + auto structEle = structArray->elements() + ->as() + ->childAt(0) + ->asFlatVector() + ->valueAt(0); + EXPECT_EQ(structEle, 9); +} + TEST_F(ParquetReaderTest, readSampleBigintRangeFilter) { // Read sample.parquet with the int filter "a BETWEEN 16 AND 20". FilterMap filters;