From ede6a69dd0a54c088fd4d91722793c3ed9d775bb Mon Sep 17 00:00:00 2001 From: kevincmchen Date: Wed, 31 Jul 2024 19:21:38 -0700 Subject: [PATCH] support to read fixed length binary as string. (#10621) Summary: This is a followup of https://github.com/facebookincubator/velox/issues/10399, support to read fixed length binary as string. Pull Request resolved: https://github.com/facebookincubator/velox/pull/10621 Reviewed By: Yuhta Differential Revision: D60527079 Pulled By: kagamiori fbshipit-source-id: a13f3bb963c57e98d494fb9fc194f2a988ba2bee --- velox/dwio/parquet/reader/PageReader.cpp | 2 +- .../dwio/parquet/tests/examples/uuid.parquet | Bin 0 -> 730 bytes .../tests/reader/ParquetReaderTest.cpp | 28 ++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 velox/dwio/parquet/tests/examples/uuid.parquet diff --git a/velox/dwio/parquet/reader/PageReader.cpp b/velox/dwio/parquet/reader/PageReader.cpp index 004b0f6b801c..cf46fdb58184 100644 --- a/velox/dwio/parquet/reader/PageReader.cpp +++ b/velox/dwio/parquet/reader/PageReader.cpp @@ -666,7 +666,7 @@ void PageReader::makeDecoder() { pageData_, pageData_ + encodedDataSize_); break; case thrift::Type::FIXED_LEN_BYTE_ARRAY: - if (type_->type()->isVarbinary()) { + if (type_->type()->isVarbinary() || type_->type()->isVarchar()) { stringDecoder_ = std::make_unique( pageData_, pageData_ + encodedDataSize_, type_->typeLength_); } else { diff --git a/velox/dwio/parquet/tests/examples/uuid.parquet b/velox/dwio/parquet/tests/examples/uuid.parquet new file mode 100644 index 0000000000000000000000000000000000000000..91ca9d2061fd979c7822c1833e80cdec68fd3424 GIT binary patch literal 730 zcma))&5G1O5XU=Fh9J1GXoo~{7{WTNFddSvB%Lf?)`KkKVcDyQbo#>$&d1u5Ssj;o z0r9X$-@u!XA)Y;X@*qBdk6>s0SUm`H>#F)w_4`-#;r>yE5&rDpFAo0n@$0+m5gyot zhdqQQ-=7tES>;8}36!9T5TYVXrVU|b$|z@r)SOewin_=(fm+w3fifdH6%CP;<|5Cy zW>BQSKyVJEC{;zIC@P|rVnl*SE0$8mD+ssUFxc5h8SHEVhTYIdD2#j$pS$*jY+`(H z>wf!E$%kt8v^&1H>eRk5e08M6+(_&qyoC=i3a%}et$d=d8 zUBs?G@L#+BxjXXimS}e)uFXtZaC->LjFA7~3ikcCE`INze%fH<*@-u;w{dXegq!C} z)g}DY%#(#UR#P6lan9myb)w?E*jzY^$MK9$`}qIvt>(h;BHnv+_FI0r2V4Hx@=t5s zFY>bUKhj0}%1(oFy?(q#!S)N&cFH8vxm1&+Ckd|hyeN3^3!hBQ@Kl*aJD&}+B%=u( w?l#MIB8QAANJW*_09C0(K?MlSO96Sqlm?&~P%7&DP8db@#@`%-4(+f00m_lWlmGw# literal 0 HcmV?d00001 diff --git a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp index 1a4ef241bd3c..743f14682627 100644 --- a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp @@ -1180,6 +1180,34 @@ TEST_F(ParquetReaderTest, readBinaryAsStringFromNation) { 0)); } +TEST_F(ParquetReaderTest, readFixedLenBinaryAsStringFromUuid) { + const std::string filename("uuid.parquet"); + const std::string sample(getExampleFilePath(filename)); + + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto outputRowType = ROW({"uuid_field"}, {VARCHAR()}); + + readerOptions.setFileSchema(outputRowType); + auto reader = createReader(sample, readerOptions); + EXPECT_EQ(reader->numberOfRows(), 3ULL); + auto rowType = reader->typeWithId(); + EXPECT_EQ(rowType->type()->kind(), TypeKind::ROW); + EXPECT_EQ(rowType->size(), 1ULL); + EXPECT_EQ(rowType->childAt(0)->type()->kind(), TypeKind::VARCHAR); + + auto rowReaderOpts = getReaderOpts(outputRowType); + rowReaderOpts.setScanSpec(makeScanSpec(outputRowType)); + auto rowReader = reader->createRowReader(rowReaderOpts); + + auto expected = std::string("5468454a-363f-ccc8-7d0b-76072a75dfaa"); + VectorPtr result = BaseVector::create(outputRowType, 0, &(*leafPool_)); + rowReader->next(1, result); + EXPECT_EQ( + expected, + result->as()->childAt(0)->asFlatVector()->valueAt( + 0)); +} + TEST_F(ParquetReaderTest, testV2PageWithZeroMaxDefRep) { // enum_type.parquet contains 1 column (ENUM) with 3 rows. const std::string sample(getExampleFilePath("v2_page.parquet"));