Skip to content

Commit

Permalink
fix parsing array struct when repetition_type is 'REPEATED'
Browse files Browse the repository at this point in the history
  • Loading branch information
yma11 committed Aug 17, 2024
1 parent 453db2d commit 419b10a
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 0 deletions.
37 changes: 37 additions & 0 deletions velox/dwio/parquet/reader/ParquetReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,43 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
maxDefine,
isOptional,
isRepeated);
} else {
// Row type
// To support list backward compatibility, need create a new row type
// instance and set all the fields as its children.
auto childrenRowType =
createRowType(children, isFileColumnNamesReadAsLowerCase());
std::vector<std::unique_ptr<ParquetTypeWithId::TypeWithId>>
rowChildren;
// In this legacy case, there is no middle layer between "array"
// node and the children nodes. Below creates this dummy middle
// layer to mimic the non-legacy case and fill the gap.
rowChildren.emplace_back(std::make_unique<ParquetTypeWithId>(
childrenRowType,
std::move(children),
curSchemaIdx,
maxSchemaElementIdx,
ParquetTypeWithId::kNonLeaf,
"dummy",
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine,
isOptional,
isRepeated));
return std::make_unique<ParquetTypeWithId>(
TypeFactory<TypeKind::ARRAY>::create(childrenRowType),
std::move(rowChildren),
curSchemaIdx,
maxSchemaElementIdx,
ParquetTypeWithId::kNonLeaf, // columnIdx,
std::move(name),
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine,
isOptional,
isRepeated);
}
} else {
// Row type
Expand Down
Binary file not shown.
50 changes: 50 additions & 0 deletions velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,56 @@ TEST_F(ParquetReaderTest, parseMapKeyValueAsMap) {
assertReadWithReaderAndExpected(fileSchema, *rowReader, expected, *leafPool_);
}

// Verifies that a legacy (protobuf-style) Parquet schema in which a group is
// marked REPEATED directly, with no intermediate list layer, is exposed as
// ARRAY<ROW<...>> and that its data can be read back.
//
// Structural preconditions use ASSERT_* rather than EXPECT_* because every
// later statement dereferences what they guard; continuing after a failed
// precondition would crash instead of producing a readable test failure.
TEST_F(ParquetReaderTest, parseRowArrayTest) {
  // schema:
  // optionalPrimitive:int
  // requiredPrimitive:int
  // repeatedPrimitive:array<int>
  // optionalMessage:struct<someId:int>
  // requiredMessage:struct<someId:int>
  // repeatedMessage:array<struct<someId:int>>
  const std::string sample(
      getExampleFilePath("proto-struct-with-array.parquet"));

  dwio::common::ReaderOptions readerOptions{leafPool_.get()};
  auto reader = createReader(sample, readerOptions);
  EXPECT_EQ(reader->numberOfRows(), 1ULL);

  // The file schema must expose six top-level columns before the sixth one
  // can be inspected.
  auto type = reader->typeWithId();
  ASSERT_EQ(type->size(), 6ULL);

  // The sixth column (repeatedMessage) must be an ARRAY whose single child is
  // the ROW wrapping the struct fields.
  auto col6_type = type->childAt(5);
  ASSERT_EQ(col6_type->type()->kind(), TypeKind::ARRAY);
  ASSERT_EQ(col6_type->size(), 1ULL);
  auto col6_1_type = col6_type->childAt(0);
  EXPECT_EQ(col6_1_type->type()->kind(), TypeKind::ROW);

  auto outputRowType =
      ROW({"optionalPrimitive",
           "requiredPrimitive",
           "repeatedPrimitive",
           "optionalMessage",
           "requiredMessage",
           "repeatedMessage"},
          {INTEGER(),
           INTEGER(),
           ARRAY(INTEGER()),
           ROW({"someId"}, {INTEGER()}),
           ROW({"someId"}, {INTEGER()}),
           ARRAY(ROW({"someId"}, {INTEGER()}))});
  auto rowReaderOpts = getReaderOpts(outputRowType);
  rowReaderOpts.setScanSpec(makeScanSpec(outputRowType));
  auto rowReader = reader->createRowReader(rowReaderOpts);
  VectorPtr result = BaseVector::create(outputRowType, 0, &*leafPool_);

  ASSERT_TRUE(rowReader->next(1, result));
  // data: 10, 9, <empty>, null, {9}, 2 elements starting at 0 {{9}, {10}}}

  // Walk down to the "someId" values of the repeatedMessage column, checking
  // each cast so that an unexpected vector encoding fails the test cleanly.
  auto* resultRow = result->as<RowVector>();
  ASSERT_NE(resultRow, nullptr);
  auto* structArray = resultRow->childAt(5)->as<ArrayVector>();
  ASSERT_NE(structArray, nullptr);
  auto* structElements = structArray->elements()->as<RowVector>();
  ASSERT_NE(structElements, nullptr);
  auto* someIdValues = structElements->childAt(0)->asFlatVector<int32_t>();
  ASSERT_NE(someIdValues, nullptr);
  // valueAt(0) below requires at least one element to be present.
  ASSERT_GT(someIdValues->size(), 0);

  auto structEle = someIdValues->valueAt(0);
  EXPECT_EQ(structEle, 9);
}

TEST_F(ParquetReaderTest, readSampleBigintRangeFilter) {
// Read sample.parquet with the int filter "a BETWEEN 16 AND 20".
FilterMap filters;
Expand Down

0 comments on commit 419b10a

Please sign in to comment.