diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md
index 5670eef08101..922332b15d64 100644
--- a/parquet/CONTRIBUTING.md
+++ b/parquet/CONTRIBUTING.md
@@ -57,8 +57,10 @@ Run `cargo bench` for benchmarks.
 
 ## Docs
 
-To build documentation, run `cargo doc --no-deps`.
-To compile and view in the browser, run `cargo doc --no-deps --open`.
+To build documentation, run `cargo doc --no-deps --all-features`.
+To compile and view in the browser, run `cargo doc --no-deps --all-features --open`.
+
+Before submitting a pull request, run `cargo fmt --all` to format the change.
 
 ## Update Parquet Format
 
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index bdcbcb81cfce..4cd03c051e62 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -81,6 +81,7 @@ serde_json = { version = "1.0", features = ["std"], default-features = false }
 arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] }
 tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] }
 rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
+object_store = { version = "0.8", default-features = false, features = ["azure"] }
 
 [package.metadata.docs.rs]
 all-features = true
diff --git a/parquet/README.md b/parquet/README.md
index 2e0ab1d52c30..9de7aec4e59a 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -71,14 +71,6 @@ The `parquet` crate provides the following features which may be enabled in your
 - [x] Predicate pushdown
 - [x] Parquet format 4.0.0 support
 
-## Support for `wasm32-unknown-unknown` target
-
-It's possible to build `parquet` for the `wasm32-unknown-unknown` target, however not all the compression features are currently unsupported due to issues with the upstream crates. In particular, the `zstd` and `lz4` features may have compilation issues. See issue [#180](https://github.com/apache/arrow-rs/issues/180).
-
-```
-cargo build -p parquet --target wasm32-unknown-unknown --no-default-features --features cli,snap,flate2,brotli
-```
-
 ## License
 
 Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.
diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs
index 3e27a96124b0..293b91aea3ba 100644
--- a/parquet/src/arrow/async_reader/store.rs
+++ b/parquet/src/arrow/async_reader/store.rs
@@ -28,7 +28,30 @@ use crate::arrow::async_reader::{AsyncFileReader, MetadataLoader};
 use crate::errors::{ParquetError, Result};
 use crate::file::metadata::ParquetMetaData;
 
-/// Implements [`AsyncFileReader`] for a parquet file in object storage
+/// Reads Parquet files in object storage using [`ObjectStore`].
+///
+/// ```no_run
+/// # use std::io::stdout;
+/// # use std::sync::Arc;
+/// # use object_store::azure::MicrosoftAzureBuilder;
+/// # use object_store::ObjectStore;
+/// # use object_store::path::Path;
+/// # use parquet::arrow::async_reader::ParquetObjectReader;
+/// # use parquet::arrow::ParquetRecordBatchStreamBuilder;
+/// # use parquet::schema::printer::print_parquet_metadata;
+/// # async fn run() {
+/// // Populate configuration from environment
+/// let storage_container = Arc::new(MicrosoftAzureBuilder::from_env().build().unwrap());
+/// let location = Path::from("path/to/blob.parquet");
+/// let meta = storage_container.head(&location).await.unwrap();
+/// println!("Found Blob with {}B at {}", meta.size, meta.location);
+///
+/// // Show Parquet metadata
+/// let reader = ParquetObjectReader::new(storage_container, meta);
+/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
+/// print_parquet_metadata(&mut stdout(), builder.metadata());
+/// # }
+/// ```
 #[derive(Clone, Debug)]
 pub struct ParquetObjectReader {
     store: Arc<dyn ObjectStore>,
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index 63885643c0fd..950226aef721 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Provides API for reading/writing Arrow
+//! High-level API for reading/writing Arrow
 //! [RecordBatch](arrow_array::RecordBatch)es and
 //! [Array](arrow_array::Array)s to/from Parquet Files.
 //!
diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs
index c20fd38c7f8b..6589d2efaf8b 100644
--- a/parquet/src/file/mod.rs
+++ b/parquet/src/file/mod.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Main entrypoint for working with Parquet API.
+//! Low-level APIs for reading raw parquet data.
 //!
 //! Provides access to file and row group readers and writers, record API, metadata, etc.
 //!
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index 921f9df290cc..dd6a0fdd2312 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -134,7 +134,7 @@ pub trait FileReader: Send + Sync {
     /// Get the `i`th row group reader. Note this doesn't do bound check.
     fn get_row_group(&self, i: usize) -> Result<Box<dyn RowGroupReader + '_>>;
 
-    /// Get full iterator of `Row`s from a file (over all row groups).
+    /// Get an iterator over the rows in this file, see [`RowIter`] for caveats.
     ///
     /// Iterator will automatically load the next row group to advance.
     ///
@@ -194,7 +194,7 @@ pub trait RowGroupReader: Send + Sync {
     /// to read bloom filters.
     fn get_column_bloom_filter(&self, i: usize) -> Option<&Sbbf>;
 
-    /// Get iterator of `Row`s from this row group.
+    /// Get an iterator over the rows in this row group, see [`RowIter`] for caveats.
     ///
     /// Projected schema can be a subset of or equal to the file schema, when it is None,
     /// full file schema is assumed.
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 0279bbc382ea..db5d72634389 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -15,24 +15,67 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//!
 //! This crate contains the official Native Rust implementation of
 //! [Apache Parquet](https://parquet.apache.org/), part of
 //! the [Apache Arrow](https://arrow.apache.org/) project.
+//! The crate provides a number of APIs to read and write Parquet files,
+//! covering a range of use cases.
 //!
 //! Please see the [parquet crates.io](https://crates.io/crates/parquet)
 //! page for feature flags and tips to improve performance.
 //!
-//! # Getting Started
-//! Start with some examples:
+//! # Format Overview
+//!
+//! Parquet is a columnar format, which means that unlike row formats like [CSV], values are
+//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], with Parquet
+//! focusing on storage efficiency whereas Arrow prioritizes compute efficiency.
+//!
+//! Parquet files are partitioned for scalability. Each file contains metadata,
+//! along with zero or more "row groups", each row group containing one or
+//! more columns. The APIs in this crate reflect this structure.
+//!
+//! Parquet distinguishes between "logical" and "physical" data types.
+//! For instance, strings (logical type) are stored as byte arrays (physical type).
+//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
+//! are stored as integers (physical type). This crate exposes both kinds of types.
+//!
+//! For more details about the Parquet format, see the
+//! [Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
+//!
+//! # APIs
+//!
+//! This crate exposes a number of APIs for different use-cases.
+//!
+//! ## Read/Write Arrow
+//!
+//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`.
+//! This makes for a simple and performant interface to parquet data, whilst allowing workloads
+//! to leverage the wide range of data transforms provided by the [arrow] crate, and by the
+//! ecosystem of libraries and services using [Arrow] as an interop format.
+//!
+//! ## Read/Write Arrow Async
+//!
+//! When the `async` feature is enabled, [`arrow::async_reader`] and [`arrow::async_writer`]
+//! provide the ability to read and write [`arrow`] data asynchronously. Additionally, when the
+//! `object_store` feature is enabled, [`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader)
+//! provides efficient integration with object storage services such as S3 via the [object_store]
+//! crate, automatically optimizing IO based on any predicates or projections provided.
 //!
-//! 1. [mod@file] for reading and writing parquet files using the
-//! [ColumnReader](column::reader::ColumnReader) API.
+//! ## Read/Write Parquet
 //!
-//! 2. [arrow] for reading and writing parquet files to Arrow
-//! `RecordBatch`es
+//! Workloads needing finer-grained control, or looking to not take a dependency on arrow,
+//! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet
+//! data model, and therefore require knowledge of the underlying parquet format,
+//! including the details of [Dremel] record shredding and [Logical Types]. Most workloads
+//! should prefer the arrow interfaces.
 //!
-//! 3. [arrow::async_reader] and [arrow::async_writer] for `async` reading
-//! and writing parquet files to Arrow `RecordBatch`es (requires the `async` feature).
+//! [arrow]: https://docs.rs/arrow/latest/arrow/index.html
+//! [Arrow]: https://arrow.apache.org/
+//! [CSV]: https://en.wikipedia.org/wiki/Comma-separated_values
+//! [Dremel]: https://research.google/pubs/pub36632/
+//! [Logical Types]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+//! [object_store]: https://docs.rs/object_store/latest/object_store/
 
 /// Defines a an item with an experimental public API
 ///
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index f98939725517..feaa8055e2dd 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -609,9 +609,20 @@ impl<'a> Either<'a> {
     }
 }
 
-/// Iterator of [`Row`]s.
-/// It is used either for a single row group to iterate over data in that row group, or
-/// an entire file with auto buffering of all row groups.
+/// Access parquet data as an iterator of [`Row`]
+///
+/// # Caveats
+///
+/// Parquet stores data in a columnar fashion using [Dremel] encoding, and is therefore highly
+/// optimised for reading data by column, not row. As a consequence, applications concerned with
+/// performance should prefer the columnar arrow or [ColumnReader] APIs.
+///
+/// Additionally, the current implementation does not correctly handle repeated fields ([#2394]),
+/// and workloads looking to handle such schemas should use the other APIs.
+///
+/// [#2394]: https://github.com/apache/arrow-rs/issues/2394
+/// [ColumnReader]: crate::file::reader::RowGroupReader::get_column_reader
+/// [Dremel]: https://research.google/pubs/pub36632/
 pub struct RowIter<'a> {
     descr: SchemaDescPtr,
     tree_builder: TreeBuilder,
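
For reference, a minimal sketch of the Arrow interface described in the new lib.rs "Read/Write Arrow" section: write a `RecordBatch` with `ArrowWriter`, then read it back with `ParquetRecordBatchReaderBuilder`. The `data.parquet` path and the `id` column are placeholders, and the sketch assumes the default `arrow` feature is enabled.

```rust
use std::fs::File;
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ArrowWriter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write a RecordBatch to a Parquet file (placeholder path and column)
    let ids = Int32Array::from(vec![1, 2, 3]);
    let batch = RecordBatch::try_from_iter([("id", Arc::new(ids) as ArrayRef)])?;

    let mut writer = ArrowWriter::try_new(File::create("data.parquet")?, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read it back as an iterator of RecordBatches
    let reader = ParquetRecordBatchReaderBuilder::try_new(File::open("data.parquet")?)?.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```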
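
Similarly, a sketch of the row-oriented access whose caveats the `RowIter` docs above describe, using `get_row_iter` from the low-level `file` API. The path is again a placeholder; the item is debug-printed because, depending on the crate version, the iterator may yield `Row` or `Result<Row>`, and, as noted above, the columnar APIs should be preferred for performance-sensitive workloads.

```rust
use std::fs::File;

use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Open the file with the low-level serialized reader
    let reader = SerializedFileReader::new(File::open("data.parquet")?)?;

    // `None` means no projection, so the full file schema is used
    for row in reader.get_row_iter(None)? {
        // Debug-print each record as it is materialized row by row
        println!("{row:?}");
    }
    Ok(())
}
```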