From 776ab24cc656d872b67939b8945602dcdbe91f24 Mon Sep 17 00:00:00 2001 From: Lu Niu Date: Thu, 4 Apr 2024 14:35:24 -0700 Subject: [PATCH] Support ENUM type in parquet reader (#8263) Summary: This PR adds ENUM type support in parquet and fixes logging Pull Request resolved: https://github.com/facebookincubator/velox/pull/8263 Reviewed By: xiaoxmeng Differential Revision: D55766009 Pulled By: mbasmanova fbshipit-source-id: fcd662b6d8dc1e9d1421f23229e9e18daf58853b --- velox/dwio/parquet/reader/ParquetReader.cpp | 13 ++++++--- .../parquet/tests/examples/enum_type.parquet | Bin 0 -> 391 bytes .../tests/reader/ParquetReaderTest.cpp | 25 ++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 velox/dwio/parquet/tests/examples/enum_type.parquet diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 6200498a7137..ceeb4ee344ae 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -528,10 +528,16 @@ TypePtr ReaderBase::convertType( VELOX_FAIL( "UTF8 converted type can only be set for thrift::Type::(FIXED_LEN_)BYTE_ARRAY"); } + case thrift::ConvertedType::ENUM: { + VELOX_CHECK_EQ( + schemaElement.type, + thrift::Type::BYTE_ARRAY, + "ENUM converted type can only be set for value of thrift::Type::BYTE_ARRAY"); + return VARCHAR(); + } case thrift::ConvertedType::MAP: case thrift::ConvertedType::MAP_KEY_VALUE: case thrift::ConvertedType::LIST: - case thrift::ConvertedType::ENUM: case thrift::ConvertedType::TIME_MILLIS: case thrift::ConvertedType::TIME_MICROS: case thrift::ConvertedType::JSON: @@ -539,7 +545,7 @@ TypePtr ReaderBase::convertType( case thrift::ConvertedType::INTERVAL: default: VELOX_FAIL( - "Unsupported Parquet SchemaElement converted type: ", + "Unsupported Parquet SchemaElement converted type: {}", schemaElement.converted_type); } } else { @@ -565,7 +571,8 @@ TypePtr ReaderBase::convertType( } default: - VELOX_FAIL("Unknown Parquet SchemaElement type: ", schemaElement.type); + VELOX_FAIL( + "Unknown Parquet SchemaElement type: {}", schemaElement.type); } } } diff --git a/velox/dwio/parquet/tests/examples/enum_type.parquet b/velox/dwio/parquet/tests/examples/enum_type.parquet new file mode 100644 index 0000000000000000000000000000000000000000..90b53745ce921e8ce7212cc38104cf3e1e3b3e3b GIT binary patch literal 391 zcmYk3%}T>S5P-Ka#0aHO>4YpSm`g*I8k(g4c(5Ksq}b9IFzI&F2AZ^{sd_BpO%Sg> zhY#Svn+K0QdGQ%UJnE+P;K0uA{CqnzOt;lDK?DMa@cMK6kUrpsh|tcRjMhRu$C-Y9 z52fDOeG9!b@kfRS0Kqmi;rV^>BomN0BmpP>VSUGOyy3tT(4+1zw>X2d0#-MUT0O3h zI~@?P!kt73VZr=P42T?Bs^l}Yx{$IEAg3%-u!j;x(q+!&l5W)ORpiGk zGJ?r~jg8O^n7YZX-C#OqNIm^)Z$F5%OBT)iNvN5IX&IJQ=+FJ}K&#c5U840;!?x>H iMk^MzspnRyUGCSIXWN!(TNZ7UcQGM+m@j_1dnumberOfRows(), 3ULL); + + auto rowType = reader->typeWithId(); + EXPECT_EQ(rowType->type()->kind(), TypeKind::ROW); + EXPECT_EQ(rowType->size(), 1ULL); + + EXPECT_EQ(rowType->childAt(0)->type()->kind(), TypeKind::VARCHAR); + + auto fileSchema = ROW({"test"}, {VARCHAR()}); + auto rowReaderOpts = getReaderOpts(fileSchema); + rowReaderOpts.setScanSpec(makeScanSpec(fileSchema)); + auto rowReader = reader->createRowReader(rowReaderOpts); + + auto expected = + makeRowVector({makeFlatVector({"FOO", "BAR", "FOO"})}); + + assertReadWithReaderAndExpected(fileSchema, *rowReader, expected, *leafPool_); +}