From 109c404cfbefca53eacb1f5e1f45d453b621d5d3 Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Wed, 16 Oct 2024 08:46:41 +0200 Subject: [PATCH] fix: Parquet predicate pushdown for `lit(_) !=` (#19246) --- crates/polars-expr/src/expressions/apply.rs | 8 ----- crates/polars-expr/src/expressions/binary.rs | 6 +--- .../polars-io/src/parquet/read/predicates.rs | 33 +++++++++++++++---- py-polars/tests/unit/io/test_lazy_parquet.py | 6 ++-- 4 files changed, 31 insertions(+), 22 deletions(-) diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs index 8c71c90cd152..53579b763033 100644 --- a/crates/polars-expr/src/expressions/apply.rs +++ b/crates/polars-expr/src/expressions/apply.rs @@ -525,14 +525,6 @@ fn apply_multiple_elementwise<'a>( impl StatsEvaluator for ApplyExpr { fn should_read(&self, stats: &BatchStats) -> PolarsResult { let read = self.should_read_impl(stats)?; - if ExecutionState::new().verbose() { - if read { - eprintln!("parquet file must be read, statistics not sufficient for predicate.") - } else { - eprintln!("parquet file can be skipped, the statistics were sufficient to apply the predicate.") - } - } - Ok(read) } } diff --git a/crates/polars-expr/src/expressions/binary.rs b/crates/polars-expr/src/expressions/binary.rs index c1cb286e7104..23f50af45273 100644 --- a/crates/polars-expr/src/expressions/binary.rs +++ b/crates/polars-expr/src/expressions/binary.rs @@ -351,7 +351,7 @@ mod stats { use ChunkCompareIneq as C; match op { Operator::Eq => apply_operator_stats_eq(min_max, literal), - Operator::NotEq => apply_operator_stats_eq(min_max, literal), + Operator::NotEq => apply_operator_stats_neq(min_max, literal), Operator::Gt => { // Literal is bigger than max value, selection needs all rows. C::gt(literal, min_max).map(|ca| ca.any()).unwrap_or(false) @@ -454,10 +454,6 @@ mod stats { impl StatsEvaluator for BinaryExpr { fn should_read(&self, stats: &BatchStats) -> PolarsResult { - if std::env::var("POLARS_NO_PARQUET_STATISTICS").is_ok() { - return Ok(true); - } - use Operator::*; match ( self.left.as_stats_evaluator(), diff --git a/crates/polars-io/src/parquet/read/predicates.rs b/crates/polars-io/src/parquet/read/predicates.rs index eb8f7747f078..a3269341c1a3 100644 --- a/crates/polars-io/src/parquet/read/predicates.rs +++ b/crates/polars-io/src/parquet/read/predicates.rs @@ -1,3 +1,4 @@ +use polars_core::config; use polars_core::prelude::*; use polars_parquet::read::statistics::{deserialize, Statistics}; use polars_parquet::read::RowGroupMetadata; @@ -50,18 +51,38 @@ pub fn read_this_row_group( md: &RowGroupMetadata, schema: &ArrowSchema, ) -> PolarsResult { + if std::env::var("POLARS_NO_PARQUET_STATISTICS").is_ok() { + return Ok(true); + } + + let mut should_read = true; + if let Some(pred) = predicate { if let Some(pred) = pred.as_stats_evaluator() { if let Some(stats) = collect_statistics(md, schema)? { - let should_read = pred.should_read(&stats); + let pred_result = pred.should_read(&stats); + // a parquet file may not have statistics of all columns - if matches!(should_read, Ok(false)) { - return Ok(false); - } else if !matches!(should_read, Err(PolarsError::ColumnNotFound(_))) { - let _ = should_read?; + match pred_result { + Err(PolarsError::ColumnNotFound(errstr)) => { + return Err(PolarsError::ColumnNotFound(errstr)) + }, + Ok(false) => should_read = false, + _ => {}, } } } + + if config::verbose() { + if should_read { + eprintln!( + "parquet row group must be read, statistics not sufficient for predicate." + ); + } else { + eprintln!("parquet row group can be skipped, the statistics were sufficient to apply the predicate."); + } + } } - Ok(true) + + Ok(should_read) } diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index 792ddc42b02a..06efe16330d3 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -235,12 +235,12 @@ def test_parquet_is_in_statistics(monkeypatch: Any, capfd: Any, tmp_path: Path) captured = capfd.readouterr().err assert ( - "parquet file must be read, statistics not sufficient for predicate." + "parquet row group must be read, statistics not sufficient for predicate." in captured ) assert ( - "parquet file can be skipped, the statistics were sufficient" - " to apply the predicate." in captured + "parquet row group can be skipped, the statistics were sufficient to apply the predicate." + in captured )