From 94f52ecb85c555c0431d76a630aab86dba21fb41 Mon Sep 17 00:00:00 2001 From: "Ma, Rong" Date: Thu, 21 Mar 2024 06:13:14 -0700 Subject: [PATCH] Fix read parquet for different encodings across row groups (#9129) Summary: When the parquet file contains a column that uses both plain and dictionary encodings across row groups, the output after filter can be wrong. The root cause is that `returnReaderNulls_` in `SelectiveColumnReader` may not be reset when the column encoding switches from dictionary encoding to plain encoding. Here is the parquet metadata for the test file to reproduce this issue. `n_2` is dictionary encoded in row group 2 and is plain encoded in row group 3. ``` file: file:/home/sparkuser/github/oap-project/velox/velox/dwio/parquet/tests/examples/different_encodings_with_filter.parquet creator: parquet-cpp-arrow version 15.0.0 extra: ARROW:schema = /////9gAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAMAAABwAAAAMAAAAAQAAACs////AAABBRAAAAAYAAAABAAAAAAAAAADAAAAbl8yAAQABAAEAAAA1P///wAAAQIQAAAAFAAAAAQAAAAAAAAAAwAAAG5fMQDE////AAAAASAAAAAQABQACAAGAAcADAAAABAAEAAAAAAAAQIQAAAAHAAAAAQAAAAAAAAAAwAAAG5fMAAIAAwACAAHAAgAAAAAAAABIAAAAAAAAAA= file schema: schema -------------------------------------------------------------------------------- n_0: OPTIONAL INT32 R:0 D:1 n_1: OPTIONAL INT32 R:0 D:1 n_2: OPTIONAL BINARY O:UTF8 R:0 D:1 row group 1: RC:5 TS:185 OFFSET:4 -------------------------------------------------------------------------------- n_0: INT32 SNAPPY DO:0 FPO:4 SZ:72/71/0.99 VC:5 ENC:RLE,PLAIN ST:[min: 1, max: 5, num_nulls: 0] n_1: INT32 SNAPPY DO:0 FPO:143 SZ:65/71/1.09 VC:5 ENC:RLE,PLAIN ST:[min: 1, max: 2, num_nulls: 0] n_2: BINARY SNAPPY DO:0 FPO:276 SZ:45/43/0.96 VC:5 ENC:RLE,PLAIN ST:[min: A, max: B, num_nulls: 3] row group 2: RC:5 TS:202 OFFSET:369 -------------------------------------------------------------------------------- n_0: INT32 SNAPPY DO:0 FPO:369 SZ:72/71/0.99 VC:5 ENC:RLE,PLAIN ST:[min: 1, max: 5, num_nulls: 0] n_1: INT32 SNAPPY DO:0 FPO:509 SZ:65/71/1.09 VC:5 ENC:RLE,PLAIN ST:[min: 1, max: 2, num_nulls: 0] n_2: BINARY SNAPPY DO:642 FPO:668 SZ:64/60/0.94 VC:5 ENC:RLE,RLE_DICTIONARY,PLAIN ST:[min: A, max: B, num_nulls: 3] row group 3: RC:5 TS:195 OFFSET:766 -------------------------------------------------------------------------------- n_0: INT32 SNAPPY DO:0 FPO:766 SZ:73/71/0.97 VC:5 ENC:RLE,PLAIN ST:[min: 6, max: 10, num_nulls: 0] n_1: INT32 SNAPPY DO:0 FPO:907 SZ:72/71/0.99 VC:5 ENC:RLE,PLAIN ST:[min: 1, max: 3, num_nulls: 0] n_2: BINARY SNAPPY DO:0 FPO:1047 SZ:55/53/0.96 VC:5 ENC:RLE,PLAIN ST:[min: F, max: J, num_nulls: 1] ``` Pull Request resolved: https://github.com/facebookincubator/velox/pull/9129 Reviewed By: mbasmanova Differential Revision: D55018054 Pulled By: Yuhta fbshipit-source-id: 83dbab02575321930262628ea60cd4ad3e7b8892 --- velox/dwio/common/SelectiveColumnReader.cpp | 1 + .../different_encodings_with_filter.parquet | Bin 0 -> 2191 bytes .../tests/reader/ParquetReaderTest.cpp | 26 ++++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 velox/dwio/parquet/tests/examples/different_encodings_with_filter.parquet diff --git a/velox/dwio/common/SelectiveColumnReader.cpp b/velox/dwio/common/SelectiveColumnReader.cpp index 25aff8eb42c3..cfc9973e2ebf 100644 --- a/velox/dwio/common/SelectiveColumnReader.cpp +++ b/velox/dwio/common/SelectiveColumnReader.cpp @@ -111,6 +111,7 @@ void SelectiveColumnReader::prepareNulls( } } } + returnReaderNulls_ = false; if (resultNulls_ && resultNulls_->unique() && resultNulls_->capacity() >= bits::nbytes(numRows) + simd::kPadding) { // Clear whole capacity because future uses could hit diff --git a/velox/dwio/parquet/tests/examples/different_encodings_with_filter.parquet b/velox/dwio/parquet/tests/examples/different_encodings_with_filter.parquet new file mode 100644 index 0000000000000000000000000000000000000000..15097cc3061c6580a474aed81e3c9909dae5b830 GIT binary patch literal 2191 zcmcgu%}*0S6rb7ME@i1&NoU+8jp+dM!J<}gh7ga0 z5Mw+Tk1p8b0`d*Wy~_};YL889IdH*UIb_Pw|Bo8SA|bXitQjIc2_KFB(?a4-6D;A zDgizl;uhP+=p~9sY^Cxn69+x24!p_lh_6lFqy?ZdQ5!p!!s zuD{K;!*a}DWDX;}&{+LCwa^T`jAI?LsxRYo64G60*TJ^Lb`KVyOx&z$yH*8W@i$cb z@Wi&4kGBPbcnI_vh^%h630-O5@t~%@9%}?&FyrMQ0b>X}G)50=L*XVy&cNZ)y{|Peo4wy2Lrfep6kpDnT;ig@7`{IXTr>`H(b};Q7-L3a z#;qb<@aYVmtkPg#z~g36%%Ye>u^>L4|8p1~^Sbz+4Pyv1WG7fG6W`rjeYo~?g}G3i z0ZV;5;-?Z}nU-yIfGp=PZmVfmukRhfmv#{4K_D*u!`2oCCw)XJoZ{{nnLMbNb(`OL$$bNw_K~ z;JPMdQ${s8BV|oaDwm8RDVNYWDdjw_pm$c#&=ob&u3l15lbZ;Qj;M+rc^d`n>s(X# zaPP^)Ydjk4)2Nr0VG|-%<`o|xfr^q!pl`PPV5u@S-Nu*lhVlSN1^LZNxhfTS2Nd0H zyt($PYC+M#Q}Hx$KfKZk8oZGTvi0&+;ukhn8qZ#=Js(=#*ce)AG}fDimurp9N9*-M XsXRO~JQ9R3&8;8lhW`-D@K@wF2j>CD literal 0 HcmV?d00001 diff --git a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp index e98fcf78a202..a64df210a48a 100644 --- a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp @@ -765,6 +765,32 @@ TEST_F(ParquetReaderTest, varcharFilters) { "nation.parquet", rowType, std::move(filters), expected); } +TEST_F(ParquetReaderTest, readDifferentEncodingsWithFilter) { + FilterMap filters; + filters.insert({"n_1", exec::equal(1)}); + auto rowType = ROW({"n_0", "n_1", "n_2"}, {INTEGER(), INTEGER(), VARCHAR()}); + auto expected = makeRowVector({ + makeFlatVector({1, 2, 3, 4, 1, 2, 3, 4, 6, 9}), + makeFlatVector(10, [](auto /*row*/) { return 1; }), + makeNullableFlatVector( + {"A", + "B", + std::nullopt, + std::nullopt, + "A", + "B", + std::nullopt, + std::nullopt, + "F", + std::nullopt}), + }); + assertReadWithFilters( + "different_encodings_with_filter.parquet", + rowType, + std::move(filters), + expected); +} + // This test is to verify filterRowGroups() doesn't throw the fileOffset Velox // check failure TEST_F(ParquetReaderTest, filterRowGroups) {