From 0b0a914a868fcf8d3d0ff652c13b5cbe4db9f61a Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Sun, 3 Nov 2024 18:26:16 +0100 Subject: [PATCH] fix: Fix mask and validity confusion in Parquet String decoding (#19614) --- .../src/arrow/read/deserialize/binview.rs | 3 +- py-polars/tests/unit/io/test_parquet.py | 49 ++++++++++++++++++- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs index 62a0e7f2df4e..f9b553bea397 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs @@ -276,13 +276,12 @@ fn decode_masked_optional_plain( verify_utf8, ); } - if page_validity.unset_bits() == 0 { return decode_masked_required_plain( num_expected_values, values, target, - page_validity, + mask, verify_utf8, ); } diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index fe75fccd1ad0..773862591f0e 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -20,6 +20,7 @@ from polars.exceptions import ComputeError from polars.testing import assert_frame_equal, assert_series_equal from polars.testing.parametric import column, dataframes +from polars.testing.parametric.strategies.core import series if TYPE_CHECKING: from pathlib import Path @@ -1560,6 +1561,53 @@ def test_predicate_filtering( assert_frame_equal(result, df.filter(expr)) +@pytest.mark.parametrize( + "use_dictionary", + [False, True], +) +@pytest.mark.parametrize( + "data_page_size", + [1, None], +) +@given( + s=series( + min_size=1, + max_size=10, + excluded_dtypes=[ + pl.Decimal, + pl.Categorical, + pl.Enum, + pl.Struct, # See #19612. + ], + ), + offset=st.integers(0, 10), + length=st.integers(0, 10), +) +def test_pyarrow_slice_roundtrip( + s: pl.Series, + use_dictionary: bool, + data_page_size: int | None, + offset: int, + length: int, +) -> None: + offset %= len(s) + 1 + length %= len(s) - offset + 1 + + f = io.BytesIO() + df = s.to_frame() + pq.write_table( + df.to_arrow(), + f, + compression="NONE", + use_dictionary=use_dictionary, + data_page_size=data_page_size, + ) + + f.seek(0) + scanned = pl.scan_parquet(f).slice(offset, length).collect() + assert_frame_equal(scanned, df.slice(offset, length)) + + @given( df=dataframes( min_size=1, @@ -1579,7 +1627,6 @@ def test_slice_roundtrip(df: pl.DataFrame, offset: int, length: int) -> None: df.write_parquet(f) f.seek(0) - print((offset, length)) scanned = pl.scan_parquet(f).slice(offset, length).collect() assert_frame_equal(scanned, df.slice(offset, length))