Skip to content

Commit

Permalink
Workaround for missing Parquet page indexes in ParquetMetadaReader (#…
Browse files Browse the repository at this point in the history
…6450)

* workaround for missing page indexes

* remove empty line

* Apply suggestions from code review

Co-authored-by: Andrew Lamb <[email protected]>

* fmt

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
etseidl and alamb authored Sep 25, 2024
1 parent 6137e91 commit d48010e
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 24 deletions.
26 changes: 6 additions & 20 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField};
use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask};
use crate::column::page::{PageIterator, PageReader};
use crate::errors::{ParquetError, Result};
use crate::file::footer;
use crate::file::metadata::ParquetMetaData;
use crate::file::page_index::index_reader;
use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
use crate::file::reader::{ChunkReader, SerializedPageReader};
use crate::schema::types::SchemaDescriptor;

Expand Down Expand Up @@ -382,23 +380,9 @@ impl ArrowReaderMetadata {
/// `Self::metadata` is missing the page index, this function will attempt
/// to load the page index by making an object store request.
pub fn load<T: ChunkReader>(reader: &T, options: ArrowReaderOptions) -> Result<Self> {
let mut metadata = footer::parse_metadata(reader)?;
if options.page_index {
let column_index = metadata
.row_groups()
.iter()
.map(|rg| index_reader::read_columns_indexes(reader, rg.columns()))
.collect::<Result<Vec<_>>>()?;
metadata.set_column_index(Some(column_index));

let offset_index = metadata
.row_groups()
.iter()
.map(|rg| index_reader::read_offset_indexes(reader, rg.columns()))
.collect::<Result<Vec<_>>>()?;

metadata.set_offset_index(Some(offset_index))
}
let metadata = ParquetMetaDataReader::new()
.with_page_indexes(options.page_index)
.parse_and_finish(reader)?;
Self::try_new(Arc::new(metadata), options)
}

Expand Down Expand Up @@ -3498,6 +3482,8 @@ mod tests {
.unwrap();
// Although `Vec<Vec<PageLoacation>>` of each row group is empty,
// we should read the file successfully.
// FIXME: this test will fail when metadata parsing returns `None` for missing page
// indexes. https://github.com/apache/arrow-rs/issues/6447
assert!(builder.metadata().offset_index().unwrap()[0].is_empty());
let reader = builder.build().unwrap();
let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
Expand Down
28 changes: 24 additions & 4 deletions parquet/src/file/metadata/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ impl ParquetMetaDataReader {
// need for more data. This is not it's intended use. The plan is to add a NeedMoreData
// value to the enum, but this would be a breaking change. This will be done as
// 54.0.0 draws nearer.
// https://github.com/apache/arrow-rs/issues/6447
Err(ParquetError::IndexOutOfBound(needed, _)) => {
// If reader is the same length as `file_size` then presumably there is no more to
// read, so return an EOF error.
Expand Down Expand Up @@ -247,13 +248,18 @@ impl ParquetMetaDataReader {
));
}

// TODO(ets): what is the correct behavior for missing page indexes? MetadataLoader would
// leave them as `None`, while the parser in `index_reader::read_columns_indexes` returns a
// vector of empty vectors.
// I think it's best to leave them as `None`.
// FIXME: there are differing implementations in the case where page indexes are missing
// from the file. `MetadataLoader` will leave them as `None`, while the parser in
// `index_reader::read_columns_indexes` returns a vector of empty vectors.
// It is best for this function to replicate the latter behavior for now, but in a future
// breaking release, the two paths to retrieve metadata should be made consistent. Note that this is only
// an issue if the user requested page indexes, so there is no need to provide empty
// vectors in `try_parse_sized()`.
// https://github.com/apache/arrow-rs/issues/6447

// Get bounds needed for page indexes (if any are present in the file).
let Some(range) = self.range_for_page_index() else {
self.empty_page_indexes();
return Ok(());
};

Expand Down Expand Up @@ -412,6 +418,20 @@ impl ParquetMetaDataReader {
Ok(())
}

/// Set the column_index and offset_indexes to empty `Vec` for backwards compatibility
///
/// See <https://github.com/apache/arrow-rs/pull/6451> for details
fn empty_page_indexes(&mut self) {
let metadata = self.metadata.as_mut().unwrap();
let num_row_groups = metadata.num_row_groups();
if self.column_index {
metadata.set_column_index(Some(vec![vec![]; num_row_groups]));
}
if self.offset_index {
metadata.set_offset_index(Some(vec![vec![]; num_row_groups]));
}
}

fn range_for_page_index(&self) -> Option<Range<usize>> {
// sanity check
self.metadata.as_ref()?;
Expand Down

0 comments on commit d48010e

Please sign in to comment.