apache · zhuqi-lucas · Jun 8, 2025 · zhuqi-lucas · Jun 8, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -88,19 +88,19 @@ ahash = { version = "0.8", default-features = false, features = [
     "runtime-rng",
 ] }
 apache-avro = { version = "0.17", default-features = false }
-arrow = { version = "55.1.0", features = [
+arrow = { path = "/Users/zhuqi/arrow-rs/arrow", features = [
     "prettyprint",
     "chrono-tz",
 ] }
-arrow-buffer = { version = "55.0.0", default-features = false }
-arrow-flight = { version = "55.1.0", features = [
+arrow-buffer = {path = "/Users/zhuqi/arrow-rs/arrow-buffer", default-features = false }
+arrow-flight = { path = "/Users/zhuqi/arrow-rs/arrow-flight", features = [
     "flight-sql-experimental",
 ] }
-arrow-ipc = { version = "55.0.0", default-features = false, features = [
+arrow-ipc = { path = "/Users/zhuqi/arrow-rs/arrow-ipc", default-features = false, features = [
     "lz4",
 ] }
-arrow-ord = { version = "55.0.0", default-features = false }
-arrow-schema = { version = "55.0.0", default-features = false }
+arrow-ord = { path = "/Users/zhuqi/arrow-rs/arrow-ord", default-features = false }
+arrow-schema = { path = "/Users/zhuqi/arrow-rs/arrow-schema", default-features = false }
 async-trait = "0.1.88"
 bigdecimal = "0.4.8"
 bytes = "1.10"
@@ -151,7 +151,7 @@ itertools = "0.14"
 log = "^0.4"
 object_store = { version = "0.12.0", default-features = false }
 parking_lot = "0.12"
-parquet = { version = "55.1.0", default-features = false, features = [
+parquet = {path = "/Users/zhuqi/arrow-rs/parquet", default-features = false, features = [
     "arrow",
     "async",
     "object_store",

diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs
@@ -186,17 +186,15 @@ impl FileOpener for ParquetOpener {
                 &predicate_creation_errors,
             );
 
-            // The page index is not stored inline in the parquet footer so the
-            // code above may not have read the page index structures yet. If we
-            // need them for reading and they aren't yet loaded, we need to load them now.
             if should_enable_page_index(enable_page_index, &page_pruning_predicate) {
+                let col_idxs: Vec<usize> = page_pruning_predicate.as_ref().unwrap().columns_needed();
                 reader_metadata = load_page_index(
                     reader_metadata,
                     &mut async_file_reader,
-                    // Since we're manually loading the page index the option here should not matter but we pass it in for consistency
                     options.with_page_index(true),
+                    &col_idxs,
                 )
-                .await?;
+                    .await?;
             }
 
             metadata_timer.stop();
@@ -418,6 +416,7 @@ async fn load_page_index<T: AsyncFileReader>(
     reader_metadata: ArrowReaderMetadata,
     input: &mut T,
     options: ArrowReaderOptions,
+    col_idxs: &[usize],
 ) -> Result<ArrowReaderMetadata> {
     let parquet_metadata = reader_metadata.metadata();
     let missing_column_index = parquet_metadata.column_index().is_none();
@@ -432,7 +431,8 @@ async fn load_page_index<T: AsyncFileReader>(
             .unwrap_or_else(|e| e.as_ref().clone());
         let mut reader =
             ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true);
-        reader.load_page_index(input).await?;
+        reader.load_page_index_with_columns(input, col_idxs).await?;
+
         let new_parquet_metadata = reader.finish()?;
         let new_arrow_reader =
             ArrowReaderMetadata::try_new(Arc::new(new_parquet_metadata), options)?;

diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs
@@ -279,6 +279,17 @@ impl PagePruningAccessPlanFilter {
     pub fn filter_number(&self) -> usize {
         self.predicates.len()
     }
+
+    /// Returns `column.index()` for each page predicate whose `required_columns().single_column()` is `Some`; other predicates are skipped and duplicate indexes are not removed.
+    pub fn columns_needed(&self) -> Vec<usize> {
+        self.predicates.iter()
+            .filter_map(|pp| pp.required_columns().single_column())
+            .map(|column| {
+                // NOTE(review): presumably an index into the parquet file schema; confirm which schema this index refers to
+                column.index()
+            })
+            .collect()
+    }
 }
 
 fn update_selection(