Skip to content

Commit

Permalink
Support ENUM type in parquet reader (facebookincubator#8263)
Browse files Browse the repository at this point in the history
Summary:
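This PR adds support for the ENUM converted type to the Parquet reader (ENUM columns are read as VARCHAR) and fixes the error messages for unsupported converted types and unknown schema element types by adding the missing `{}` format placeholders.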

Pull Request resolved: facebookincubator#8263

Reviewed By: xiaoxmeng

Differential Revision: D55766009

Pulled By: mbasmanova

fbshipit-source-id: fcd662b6d8dc1e9d1421f23229e9e18daf58853b
  • Loading branch information
Lu Niu authored and facebook-github-bot committed Apr 4, 2024
1 parent 08dbd0a commit 776ab24
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 3 deletions.
13 changes: 10 additions & 3 deletions velox/dwio/parquet/reader/ParquetReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -528,18 +528,24 @@ TypePtr ReaderBase::convertType(
VELOX_FAIL(
"UTF8 converted type can only be set for thrift::Type::(FIXED_LEN_)BYTE_ARRAY");
}
case thrift::ConvertedType::ENUM: {
VELOX_CHECK_EQ(
schemaElement.type,
thrift::Type::BYTE_ARRAY,
"ENUM converted type can only be set for value of thrift::Type::BYTE_ARRAY");
return VARCHAR();
}
case thrift::ConvertedType::MAP:
case thrift::ConvertedType::MAP_KEY_VALUE:
case thrift::ConvertedType::LIST:
case thrift::ConvertedType::ENUM:
case thrift::ConvertedType::TIME_MILLIS:
case thrift::ConvertedType::TIME_MICROS:
case thrift::ConvertedType::JSON:
case thrift::ConvertedType::BSON:
case thrift::ConvertedType::INTERVAL:
default:
VELOX_FAIL(
"Unsupported Parquet SchemaElement converted type: ",
"Unsupported Parquet SchemaElement converted type: {}",
schemaElement.converted_type);
}
} else {
Expand All @@ -565,7 +571,8 @@ TypePtr ReaderBase::convertType(
}

default:
VELOX_FAIL("Unknown Parquet SchemaElement type: ", schemaElement.type);
VELOX_FAIL(
"Unknown Parquet SchemaElement type: {}", schemaElement.type);
}
}
}
Expand Down
Binary file added velox/dwio/parquet/tests/examples/enum_type.parquet
Binary file not shown.
25 changes: 25 additions & 0 deletions velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -944,3 +944,28 @@ TEST_F(ParquetReaderTest, testEmptyRowGroups) {

assertReadWithReaderAndExpected(fileSchema, *rowReader, expected, *leafPool_);
}

// Checks that an ENUM column in a Parquet file is surfaced as VARCHAR in the
// reader's schema and that its values are read back as strings.
TEST_F(ParquetReaderTest, testEnumType) {
  // Fixture: enum_type.parquet contains 1 column (ENUM) with 3 rows.
  const std::string enumFilePath(getExampleFilePath("enum_type.parquet"));

  facebook::velox::dwio::common::ReaderOptions options{leafPool_.get()};
  auto reader = createReader(enumFilePath, options);
  EXPECT_EQ(reader->numberOfRows(), 3ULL);

  // The file schema is expected to be a ROW with exactly one child, and that
  // child must be reported as VARCHAR.
  auto schemaWithId = reader->typeWithId();
  EXPECT_EQ(schemaWithId->type()->kind(), TypeKind::ROW);
  EXPECT_EQ(schemaWithId->size(), 1ULL);
  EXPECT_EQ(schemaWithId->childAt(0)->type()->kind(), TypeKind::VARCHAR);

  // Build a row reader that scans the single "test" column as VARCHAR.
  auto fileSchema = ROW({"test"}, {VARCHAR()});
  auto rowReaderOptions = getReaderOpts(fileSchema);
  rowReaderOptions.setScanSpec(makeScanSpec(fileSchema));
  auto rowReader = reader->createRowReader(rowReaderOptions);

  // Values the three rows are expected to produce.
  auto enumValues = makeFlatVector<StringView>({"FOO", "BAR", "FOO"});
  auto expected = makeRowVector({enumValues});

  assertReadWithReaderAndExpected(fileSchema, *rowReader, expected, *leafPool_);
}

0 comments on commit 776ab24

Please sign in to comment.