From 51bcadbcd13f0775d40f153263cb02a3a5b57056 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 11 Mar 2024 15:39:50 +1300 Subject: [PATCH] Better document parquet pushdown (#5491) --- parquet/src/arrow/arrow_reader/filter.rs | 4 ++++ parquet/src/arrow/arrow_reader/mod.rs | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs index a80255f413ee..4686e1512727 100644 --- a/parquet/src/arrow/arrow_reader/filter.rs +++ b/parquet/src/arrow/arrow_reader/filter.rs @@ -96,6 +96,10 @@ where /// leaves 99% of the rows, it may be better to not filter the data from parquet and /// apply the filter after the RecordBatch has been fully decoded. /// +/// Additionally, even if a predicate eliminates a moderate number of rows, it may still be faster +/// to filter the data after the RecordBatch has been fully decoded, if the eliminated rows are +/// not contiguous. +/// /// [`RowSelection`]: crate::arrow::arrow_reader::RowSelection pub struct RowFilter { /// A list of [`ArrowPredicate`] diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a34ce77f2778..83d6f6f553fc 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -141,6 +141,9 @@ impl ArrowReaderBuilder { /// An example use case of this would be applying a selection determined by /// evaluating predicates against the [`Index`] /// + /// It is recommended to enable reading the page index if using this functionality, to allow + /// more efficient skipping over data pages. 
See [`ArrowReaderOptions::with_page_index`]. + /// /// [`Index`]: crate::file::page_index::index::Index pub fn with_row_selection(self, selection: RowSelection) -> Self { Self { @@ -152,6 +155,9 @@ impl ArrowReaderBuilder { /// Provide a [`RowFilter`] to skip decoding rows /// /// Row filters are applied after row group selection and row selection + /// + /// It is recommended to enable reading the page index if using this functionality, to allow + /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`]. pub fn with_row_filter(self, filter: RowFilter) -> Self { Self { filter: Some(filter), @@ -163,6 +169,9 @@ impl ArrowReaderBuilder { /// /// The limit will be applied after any [`Self::with_row_selection`] and [`Self::with_row_filter`] /// allowing it to limit the final set of rows decoded after any pushed down predicates + /// + /// It is recommended to enable reading the page index if using this functionality, to allow + /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`]. pub fn with_limit(self, limit: usize) -> Self { Self { limit: Some(limit), @@ -174,6 +183,9 @@ impl ArrowReaderBuilder { /// /// The offset will be applied after any [`Self::with_row_selection`] and [`Self::with_row_filter`] /// allowing it to skip rows after any pushed down predicates + /// + /// It is recommended to enable reading the page index if using this functionality, to allow + /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`]. pub fn with_offset(self, offset: usize) -> Self { Self { offset: Some(offset),