Fix Parquet read with an isNull filter on nested array (#10890)
Summary:
The failure below occurs when selecting a struct column from a Parquet file
with an isNull filter on a nested array, when that filter is extracted into
subfield filters in 'extractFiltersFromRemainingFilter'.

```
velox/dwio/parquet/reader/PageReader.cpp:737, Function:skip, Expression:  No decoder to skip, Source: RUNTIME, ErrorCode: INVALID_STATE
unknown file: Failure
C++ exception with description "Exception: VeloxRuntimeError
Error Source: RUNTIME
Error Code: INVALID_STATE
Reason: No decoder to skip
```
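
For illustration only (not part of the commit): a minimal sketch of the kind of
scan that hits this path, assuming the schema of the struct_of_array.parquet
file used by the new test below. The "struct.a0 is null" predicate is supplied
as a remaining filter; extractFiltersFromRemainingFilter then turns it into a
subfield filter on the nested array, which drives PageReader::skip.

```
// Hypothetical reproduction sketch; the schema and the filter string mirror
// the filterOnNestedArray test added by this commit.
#include "velox/exec/tests/utils/PlanBuilder.h"
#include "velox/type/Type.h"

using namespace facebook::velox;

core::PlanNodePtr makeNestedArrayIsNullScan() {
  // One struct column holding two nested arrays, as in struct_of_array.parquet.
  auto rowType = ROW(
      {"struct"}, {ROW({"a0", "a1"}, {ARRAY(VARCHAR()), ARRAY(INTEGER())})});
  return exec::test::PlanBuilder()
      .tableScan(
          rowType,
          /*subfieldFilters=*/{},
          /*remainingFilter=*/"struct.a0 is null")
      .planNode();
}
```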

Pull Request resolved: #10890

Reviewed By: Yuhta

Differential Revision: D62885150

Pulled By: kevinwilfong

fbshipit-source-id: a940c092f03a6dbf234312e05306760cf3556c26
rui-mo authored and facebook-github-bot committed Sep 17, 2024
1 parent d8b60db commit 91b80ee
Showing 3 changed files with 21 additions and 3 deletions.
velox/dwio/parquet/reader/PageReader.cpp (6 additions, 0 deletions)

```
@@ -719,8 +719,14 @@ void PageReader::skip(int64_t numRows) {
   }
   firstUnvisited_ += numRows;
 
+  if (toSkip == 0) {
+    return;
+  }
   // Skip nulls
   toSkip = skipNulls(toSkip);
+  if (toSkip == 0) {
+    return;
+  }
 
   // Skip the decoder
   if (isDictionary()) {
```
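
Why the early returns help, as I read the change (a hedged summary, not an
authoritative description of PageReader's internals): when every row being
skipped is a null of the nested field, no value decoder has been created for
the page, so falling through to the decoder dispatch trips the "No decoder to
skip" check. Returning as soon as toSkip reaches zero, both before and after
skipNulls, avoids touching a decoder that may not exist. A self-contained toy
model of that control flow:

```
// Toy model of the fixed skip path; the names and state are simplified
// stand-ins for PageReader's real members, kept only to show control flow.
#include <algorithm>
#include <cstdint>
#include <stdexcept>

struct PageSkipModel {
  int64_t numNulls = 0;     // Null rows remaining in the current page.
  bool hasDecoder = false;  // True once a value decoder exists for the page.

  // Consumes up to 'toSkip' nulls; returns how many values still need skipping.
  int64_t skipNulls(int64_t toSkip) {
    const int64_t skipped = std::min<int64_t>(toSkip, numNulls);
    numNulls -= skipped;
    return toSkip - skipped;
  }

  void skip(int64_t toSkip) {
    if (toSkip == 0) {
      return;  // Nothing to skip; a decoder may not exist yet.
    }
    toSkip = skipNulls(toSkip);
    if (toSkip == 0) {
      return;  // Everything skipped was null; leave the decoder alone.
    }
    if (!hasDecoder) {
      // Corresponds to the "No decoder to skip" failure seen before the fix.
      throw std::runtime_error("No decoder to skip");
    }
    // ... skip 'toSkip' non-null values in the concrete decoder ...
  }
};
```
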
Binary file not shown.
velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp (15 additions, 3 deletions)

```
@@ -370,7 +370,6 @@ TEST_F(ParquetTableScanTest, decimalSubfieldFilter) {
       "Scalar function signature is not supported: eq(DECIMAL(5, 2), DECIMAL(5, 1))");
 }
 
-// Core dump is fixed.
 TEST_F(ParquetTableScanTest, map) {
   auto vector = makeMapVector<StringView, StringView>({{{"name", "gluten"}}});
 
@@ -399,7 +398,6 @@ TEST_F(ParquetTableScanTest, nullMap) {
   assertSelectWithFilter({"i", "c"}, {}, "", "SELECT i, c FROM tmp");
 }
 
-// Core dump is fixed.
 TEST_F(ParquetTableScanTest, singleRowStruct) {
   auto vector = makeArrayVector<int32_t>({{}});
   loadData(
@@ -414,7 +412,6 @@ TEST_F(ParquetTableScanTest, singleRowStruct) {
   assertSelectWithFilter({"s"}, {}, "", "SELECT (0, 1)");
 }
 
-// Core dump and incorrect result are fixed.
 TEST_F(ParquetTableScanTest, array) {
   auto vector = makeArrayVector<int32_t>({});
   loadData(
@@ -528,6 +525,21 @@ TEST_F(ParquetTableScanTest, reqArrayLegacy) {
       "SELECT UNNEST(array[array['a', 'b'], array[], array['c', 'd']])");
 }
 
+TEST_F(ParquetTableScanTest, filterOnNestedArray) {
+  loadData(
+      getExampleFilePath("struct_of_array.parquet"),
+      ROW({"struct"},
+          {ROW({"a0", "a1"}, {ARRAY(VARCHAR()), ARRAY(INTEGER())})}),
+      makeRowVector(
+          {"unused"},
+          {
+              makeFlatVector<int32_t>({}),
+          }));
+
+  assertSelectWithFilter(
+      {"struct"}, {}, "struct.a0 is null", "SELECT ROW(NULL, NULL)");
+}
+
 TEST_F(ParquetTableScanTest, readAsLowerCase) {
   auto plan = PlanBuilder(pool_.get())
                   .tableScan(ROW({"a"}, {BIGINT()}), {}, "")
```
