diff --git a/Cargo.lock b/Cargo.lock index ae2dace2084f..18d4b759403a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,8 +247,6 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1bb018b6960c87fd9d025009820406f74e83281185a8bdcb44880d2aa5c9a87" dependencies = [ "arrow-arith", "arrow-array", @@ -271,8 +269,6 @@ dependencies = [ [[package]] name = "arrow-arith" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44de76b51473aa888ecd6ad93ceb262fb8d40d1f1154a4df2f069b3590aa7575" dependencies = [ "arrow-array", "arrow-buffer", @@ -285,8 +281,6 @@ dependencies = [ [[package]] name = "arrow-array" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ed77e22744475a9a53d00026cf8e166fe73cf42d89c4c4ae63607ee1cfcc3f" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -302,8 +296,6 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0391c96eb58bf7389171d1e103112d3fc3e5625ca6b372d606f2688f1ea4cce" dependencies = [ "bytes", "half", @@ -313,8 +305,6 @@ dependencies = [ [[package]] name = "arrow-cast" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f39e1d774ece9292697fcbe06b5584401b26bd34be1bec25c33edae65c2420ff" dependencies = [ "arrow-array", "arrow-buffer", @@ -334,8 +324,6 @@ dependencies = [ [[package]] name = "arrow-csv" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9055c972a07bf12c2a827debfd34f88d3b93da1941d36e1d9fee85eebe38a12a" dependencies = [ "arrow-array", "arrow-cast", @@ -350,8 +338,6 @@ dependencies = [ [[package]] name = "arrow-data" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "cf75ac27a08c7f48b88e5c923f267e980f27070147ab74615ad85b5c5f90473d" dependencies = [ "arrow-buffer", "arrow-schema", @@ -362,8 +348,6 @@ dependencies = [ [[package]] name = "arrow-flight" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91efc67a4f5a438833dd76ef674745c80f6f6b9a428a3b440cbfbf74e32867e6" dependencies = [ "arrow-arith", "arrow-array", @@ -389,8 +373,6 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a222f0d93772bd058d1268f4c28ea421a603d66f7979479048c429292fac7b2e" dependencies = [ "arrow-array", "arrow-buffer", @@ -403,8 +385,6 @@ dependencies = [ [[package]] name = "arrow-json" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9085342bbca0f75e8cb70513c0807cc7351f1fbf5cb98192a67d5e3044acb033" dependencies = [ "arrow-array", "arrow-buffer", @@ -425,8 +405,6 @@ dependencies = [ [[package]] name = "arrow-ord" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab2f1065a5cad7b9efa9e22ce5747ce826aa3855766755d4904535123ef431e7" dependencies = [ "arrow-array", "arrow-buffer", @@ -438,8 +416,6 @@ dependencies = [ [[package]] name = "arrow-row" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3703a0e3e92d23c3f756df73d2dc9476873f873a76ae63ef9d3de17fda83b2d8" dependencies = [ "arrow-array", "arrow-buffer", @@ -451,8 +427,6 @@ dependencies = [ [[package]] name = "arrow-schema" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b" dependencies = [ "bitflags 2.9.1", "serde", @@ -462,8 +436,6 @@ dependencies = [ [[package]] name = "arrow-select" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"24b7b85575702b23b85272b01bc1c25a01c9b9852305e5d0078c79ba25d995d4" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -476,8 +448,6 @@ dependencies = [ [[package]] name = "arrow-string" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9260fddf1cdf2799ace2b4c2fc0356a9789fa7551e0953e35435536fecefebbd" dependencies = [ "arrow-array", "arrow-buffer", @@ -4419,8 +4389,6 @@ dependencies = [ [[package]] name = "parquet" version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be7b2d778f6b841d37083ebdf32e33a524acde1266b5884a8ca29bf00dfa1231" dependencies = [ "ahash 0.8.12", "arrow-array", diff --git a/Cargo.toml b/Cargo.toml index 64483eeb93da..10ada13809fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -88,19 +88,19 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.17", default-features = false } -arrow = { version = "55.1.0", features = [ +arrow = { path = "/Users/zhuqi/arrow-rs/arrow", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "55.0.0", default-features = false } -arrow-flight = { version = "55.1.0", features = [ +arrow-buffer = {path = "/Users/zhuqi/arrow-rs/arrow-buffer", default-features = false } +arrow-flight = { path = "/Users/zhuqi/arrow-rs/arrow-flight", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "55.0.0", default-features = false, features = [ +arrow-ipc = { path = "/Users/zhuqi/arrow-rs/arrow-ipc", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "55.0.0", default-features = false } -arrow-schema = { version = "55.0.0", default-features = false } +arrow-ord = { path = "/Users/zhuqi/arrow-rs/arrow-ord", default-features = false } +arrow-schema = { path = "/Users/zhuqi/arrow-rs/arrow-schema", default-features = false } async-trait = "0.1.88" bigdecimal = "0.4.8" bytes = "1.10" @@ -151,7 +151,7 @@ itertools = "0.14" log = "^0.4" 
object_store = { version = "0.12.0", default-features = false } parking_lot = "0.12" -parquet = { version = "55.1.0", default-features = false, features = [ +parquet = {path = "/Users/zhuqi/arrow-rs/parquet", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 9e14425074f7..ea41e8c84e6a 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -186,17 +186,15 @@ impl FileOpener for ParquetOpener { &predicate_creation_errors, ); - // The page index is not stored inline in the parquet footer so the - // code above may not have read the page index structures yet. If we - // need them for reading and they aren't yet loaded, we need to load them now. if should_enable_page_index(enable_page_index, &page_pruning_predicate) { + let col_idxs: Vec = page_pruning_predicate.as_ref().unwrap().columns_needed(); reader_metadata = load_page_index( reader_metadata, &mut async_file_reader, - // Since we're manually loading the page index the option here should not matter but we pass it in for consistency options.with_page_index(true), + &col_idxs, ) - .await?; + .await?; } metadata_timer.stop(); @@ -418,6 +416,7 @@ async fn load_page_index( reader_metadata: ArrowReaderMetadata, input: &mut T, options: ArrowReaderOptions, + col_idxs: &[usize], ) -> Result { let parquet_metadata = reader_metadata.metadata(); let missing_column_index = parquet_metadata.column_index().is_none(); @@ -432,7 +431,8 @@ async fn load_page_index( .unwrap_or_else(|e| e.as_ref().clone()); let mut reader = ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true); - reader.load_page_index(input).await?; + reader.load_page_index_with_columns(input, col_idxs).await?; + let new_parquet_metadata = reader.finish()?; let new_arrow_reader = ArrowReaderMetadata::try_new(Arc::new(new_parquet_metadata), options)?; diff --git 
a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs
index 84f5c4c2d6d5..ee336a61a55e 100644
--- a/datafusion/datasource-parquet/src/page_filter.rs
+++ b/datafusion/datasource-parquet/src/page_filter.rs
@@ -279,6 +279,21 @@ impl PagePruningAccessPlanFilter {
     pub fn filter_number(&self) -> usize {
         self.predicates.len()
     }
+
+    /// Returns the column indexes needed to evaluate the page predicates.
+    ///
+    /// Each predicate contributes `required_columns().single_column()`; a
+    /// predicate for which that returns `None` is skipped. Indexes follow the
+    /// iteration order of `self.predicates` and are not deduplicated.
+    pub fn columns_needed(&self) -> Vec<usize> {
+        self.predicates
+            .iter()
+            .filter_map(|pp| pp.required_columns().single_column())
+            // NOTE(review): assumed to be the column's index in the parquet
+            // file, as the original comment stated -- confirm against the caller.
+            .map(|column| column.index())
+            .collect()
+    }
 }
 
 fn update_selection(