Expand parquet crate overview doc (#5093)
* Expand parquet crate overview doc

* Run `cargo fmt --all`

* Nit: ask to format the code before sending a PR

* Add example reading Parquet files from cloud provider

* Tweak copy

* Fix doctest

---------

Co-authored-by: Matthieu Maitre <[email protected]>
Co-authored-by: Raphael Taylor-Davies <[email protected]>
3 people authored Nov 20, 2023
1 parent 61da64a commit 6815bf1
Showing 9 changed files with 98 additions and 26 deletions.
6 changes: 4 additions & 2 deletions parquet/CONTRIBUTING.md
@@ -57,8 +57,10 @@ Run `cargo bench` for benchmarks.

## Docs

To build documentation, run `cargo doc --no-deps`.
To compile and view in the browser, run `cargo doc --no-deps --open`.
To build documentation, run `cargo doc --no-deps --all-features`.
To compile and view in the browser, run `cargo doc --no-deps --all-features --open`.

Before submitting a pull request, run `cargo fmt --all` to format the change.

## Update Parquet Format

1 change: 1 addition & 0 deletions parquet/Cargo.toml
@@ -81,6 +81,7 @@ serde_json = { version = "1.0", features = ["std"], default-features = false }
arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] }
tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] }
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
object_store = { version = "0.8", default-features = false, features = ["azure"] }

[package.metadata.docs.rs]
all-features = true
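
For context, this commit adds `object_store` (with the `azure` backend) as a dev-dependency so the new doctest further down can compile. A downstream crate wanting to run similar code against cloud storage would need roughly the following in its own manifest — the version numbers and feature split here are illustrative assumptions, not part of this commit:

```toml
# Hypothetical downstream Cargo.toml (versions illustrative)
[dependencies]
# `async` enables the async reader/writer, `object_store` the ObjectStore integration
parquet = { version = "49", features = ["async", "object_store"] }
object_store = { version = "0.8", features = ["azure"] }
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
```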
8 changes: 0 additions & 8 deletions parquet/README.md
@@ -71,14 +71,6 @@ The `parquet` crate provides the following features which may be enabled in your
- [x] Predicate pushdown
- [x] Parquet format 4.0.0 support

## Support for `wasm32-unknown-unknown` target

It's possible to build `parquet` for the `wasm32-unknown-unknown` target, however not all of the compression features are currently supported due to issues with the upstream crates. In particular, the `zstd` and `lz4` features may have compilation issues. See issue [#180](https://github.com/apache/arrow-rs/issues/180).

```
cargo build -p parquet --target wasm32-unknown-unknown --no-default-features --features cli,snap,flate2,brotli
```

## License

Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0.
25 changes: 24 additions & 1 deletion parquet/src/arrow/async_reader/store.rs
@@ -28,7 +28,30 @@ use crate::arrow::async_reader::{AsyncFileReader, MetadataLoader};
use crate::errors::{ParquetError, Result};
use crate::file::metadata::ParquetMetaData;

/// Implements [`AsyncFileReader`] for a parquet file in object storage
/// Reads Parquet files in object storage using [`ObjectStore`].
///
/// ```no_run
/// # use std::io::stdout;
/// # use std::sync::Arc;
/// # use object_store::azure::MicrosoftAzureBuilder;
/// # use object_store::ObjectStore;
/// # use object_store::path::Path;
/// # use parquet::arrow::async_reader::ParquetObjectReader;
/// # use parquet::arrow::ParquetRecordBatchStreamBuilder;
/// # use parquet::schema::printer::print_parquet_metadata;
/// # async fn run() {
/// // Populate configuration from environment
/// let storage_container = Arc::new(MicrosoftAzureBuilder::from_env().build().unwrap());
/// let location = Path::from("path/to/blob.parquet");
/// let meta = storage_container.head(&location).await.unwrap();
/// println!("Found Blob with {}B at {}", meta.size, meta.location);
///
/// // Show Parquet metadata
/// let reader = ParquetObjectReader::new(storage_container, meta);
/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
/// print_parquet_metadata(&mut stdout(), builder.metadata());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct ParquetObjectReader {
store: Arc<dyn ObjectStore>,
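
The doctest above stops at printing metadata. As a further illustrative sketch (not part of this commit), the same `ParquetObjectReader` can be fed into the stream builder to decode the file's contents as Arrow `RecordBatch`es; this assumes the `futures` crate is available for `TryStreamExt`:

```rust
use futures::TryStreamExt;
use parquet::arrow::async_reader::ParquetObjectReader;
use parquet::arrow::ParquetRecordBatchStreamBuilder;

// Decode an object-store backed Parquet file into Arrow RecordBatches.
async fn read_batches(reader: ParquetObjectReader) -> Result<(), Box<dyn std::error::Error>> {
    let stream = ParquetRecordBatchStreamBuilder::new(reader)
        .await?
        .with_batch_size(8192) // rows per RecordBatch; tune for the workload
        .build()?;

    // Bytes are fetched and decoded lazily as the stream is polled.
    let batches: Vec<_> = stream.try_collect().await?;
    println!("read {} record batches", batches.len());
    Ok(())
}
```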
2 changes: 1 addition & 1 deletion parquet/src/arrow/mod.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

//! Provides API for reading/writing Arrow
//! High-level API for reading/writing Arrow
//! [RecordBatch](arrow_array::RecordBatch)es and
//! [Array](arrow_array::Array)s to/from Parquet Files.
//!
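
To make the reworded module doc concrete, here is a minimal round trip through the high-level Arrow API. This is an illustrative sketch, not part of the commit; the file name and the single `Int32` column are assumptions:

```rust
use std::{fs::File, sync::Arc};

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter};

fn roundtrip() -> Result<(), Box<dyn std::error::Error>> {
    let ids: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_from_iter([("id", ids)])?;

    // Write: ArrowWriter buffers rows and flushes them to Parquet row groups.
    let mut writer = ArrowWriter::try_new(File::create("data.parquet")?, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read: the reader is an iterator of RecordBatch results.
    let reader = ParquetRecordBatchReaderBuilder::try_new(File::open("data.parquet")?)?.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```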
2 changes: 1 addition & 1 deletion parquet/src/file/mod.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

//! Main entrypoint for working with Parquet API.
//! Low level APIs for reading raw parquet data.
//!
//! Provides access to file and row group readers and writers, record API, metadata, etc.
//!
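
As a small illustrative sketch (not part of the commit) of what "low level" means here: open a file and walk its row-group metadata directly. The file path is an assumption:

```rust
use std::fs::File;

use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::SerializedFileReader;

fn inspect() -> Result<(), Box<dyn std::error::Error>> {
    let reader = SerializedFileReader::new(File::open("data.parquet")?)?;

    // File-level metadata: schema, key/value metadata, row group layout.
    let metadata = reader.metadata();
    println!("{} row groups", metadata.num_row_groups());

    // Each row group can be inspected (and read) independently.
    for i in 0..metadata.num_row_groups() {
        let row_group = reader.get_row_group(i)?;
        println!("row group {i}: {} rows", row_group.metadata().num_rows());
    }
    Ok(())
}
```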
4 changes: 2 additions & 2 deletions parquet/src/file/reader.rs
@@ -134,7 +134,7 @@ pub trait FileReader: Send + Sync {
/// Get the `i`th row group reader. Note this doesn't do bounds checking.
fn get_row_group(&self, i: usize) -> Result<Box<dyn RowGroupReader + '_>>;

/// Get full iterator of `Row`s from a file (over all row groups).
/// Get an iterator over the rows in this file, see [`RowIter`] for caveats.
///
/// Iterator will automatically load the next row group to advance.
///
@@ -194,7 +194,7 @@ pub trait RowGroupReader: Send + Sync {
/// to read bloom filters.
fn get_column_bloom_filter(&self, i: usize) -> Option<&Sbbf>;

/// Get iterator of `Row`s from this row group.
/// Get an iterator over the rows in this row group, see [`RowIter`] for caveats.
///
/// Projected schema can be a subset of or equal to the file schema, when it is None,
/// full file schema is assumed.
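
For illustration (not from this commit), a sketch of how the projection parameter mentioned above is typically passed — a schema subset parsed from a message type string, accepted by both the file-level and row-group-level iterators. The column name and file path are assumptions:

```rust
use std::fs::File;

use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::SerializedFileReader;
use parquet::schema::parser::parse_message_type;

fn print_ids() -> Result<(), Box<dyn std::error::Error>> {
    let reader = SerializedFileReader::new(File::open("data.parquet")?)?;

    // Project down to a single column; passing `None` reads the full schema.
    let projection = parse_message_type("message schema { REQUIRED INT32 id; }")?;
    for row in reader.get_row_iter(Some(projection))? {
        println!("{row:?}"); // each item is the next record, Debug-printed
    }
    Ok(())
}
```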
59 changes: 51 additions & 8 deletions parquet/src/lib.rs
@@ -15,24 +15,67 @@
// specific language governing permissions and limitations
// under the License.

//!
//! This crate contains the official Native Rust implementation of
//! [Apache Parquet](https://parquet.apache.org/), part of
//! the [Apache Arrow](https://arrow.apache.org/) project.
//! The crate provides a number of APIs to read and write Parquet files,
//! covering a range of use cases.
//!
//! Please see the [parquet crates.io](https://crates.io/crates/parquet)
//! page for feature flags and tips to improve performance.
//!
//! # Getting Started
//! Start with some examples:
//! # Format Overview
//!
//! Parquet is a columnar format, which means that unlike row formats like [CSV], values are
//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], with Parquet
//! focusing on storage efficiency whereas Arrow prioritizes compute efficiency.
//!
//! Parquet files are partitioned for scalability. Each file contains metadata,
//! along with zero or more "row groups", each row group containing one or
//! more columns. The APIs in this crate reflect this structure.
//!
//! Parquet distinguishes between "logical" and "physical" data types.
//! For instance, strings (logical type) are stored as byte arrays (physical type).
//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
//! are stored as integers (physical type). This crate exposes both kinds of types.
//!
//! For more details about the Parquet format, see the
//! [Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
//!
//! # APIs
//!
//! This crate exposes a number of APIs for different use-cases.
//!
//! ## Read/Write Arrow
//!
//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`.
//! This makes for a simple and performant interface to parquet data, whilst allowing workloads
//! to leverage the wide range of data transforms provided by the [arrow] crate, and by the
//! ecosystem of libraries and services using [Arrow] as an interop format.
//!
//! ## Read/Write Arrow Async
//!
//! When the `async` feature is enabled, [`arrow::async_reader`] and [`arrow::async_writer`]
//! provide the ability to read and write [`arrow`] data asynchronously. Additionally, when the
//! `object_store` feature is enabled, [`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader)
//! provides efficient integration with object storage services such as S3 via the [object_store]
//! crate, automatically optimizing IO based on any predicates or projections provided.
//!
//! 1. [mod@file] for reading and writing parquet files using the
//! [ColumnReader](column::reader::ColumnReader) API.
//! ## Read/Write Parquet
//!
//! 2. [arrow] for reading and writing parquet files to Arrow
//! `RecordBatch`es
//! Workloads needing finer-grained control, or wishing to avoid a dependency on arrow,
//! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet
//! data model, and therefore require knowledge of the underlying parquet format,
//! including the details of [Dremel] record shredding and [Logical Types]. Most workloads
//! should prefer the arrow interfaces.
//!
//! 3. [arrow::async_reader] and [arrow::async_writer] for `async` reading
//! and writing parquet files to Arrow `RecordBatch`es (requires the `async` feature).
//! [arrow]: https://docs.rs/arrow/latest/arrow/index.html
//! [Arrow]: https://arrow.apache.org/
//! [CSV]: https://en.wikipedia.org/wiki/Comma-separated_values
//! [Dremel]: https://research.google/pubs/pub36632/
//! [Logical Types]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
//! [object_store]: https://docs.rs/object_store/latest/object_store/
/// Defines an item with an experimental public API
///
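
As an aside (not part of this commit), the logical-versus-physical distinction described in the module overview above surfaces directly in the low-level schema API: a UTF-8 string column, for example, is a `BYTE_ARRAY` physical type carrying a `String` logical type annotation. A minimal sketch:

```rust
use parquet::basic::{LogicalType, Type as PhysicalType};
use parquet::schema::types::Type;

// Declare a string column: physical storage is BYTE_ARRAY,
// the String logical type tells readers to interpret it as UTF-8.
fn string_column() -> parquet::errors::Result<Type> {
    Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY)
        .with_logical_type(Some(LogicalType::String))
        .build()
}
```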
17 changes: 14 additions & 3 deletions parquet/src/record/reader.rs
@@ -609,9 +609,20 @@ impl<'a> Either<'a> {
}
}

/// Iterator of [`Row`]s.
/// It is used either for a single row group to iterate over data in that row group, or
/// an entire file with auto buffering of all row groups.
/// Access parquet data as an iterator of [`Row`]
///
/// # Caveats
///
/// Parquet stores data in a columnar fashion using [Dremel] encoding, and is therefore highly
//! optimised for reading data by column, not row. As a consequence, applications concerned with
/// performance should prefer the columnar arrow or [ColumnReader] APIs.
///
//! Additionally, the current implementation does not correctly handle repeated fields ([#2394]),
//! so workloads that need to handle such schemas should use the other APIs.
///
/// [#2394]: https://github.com/apache/arrow-rs/issues/2394
/// [ColumnReader]: crate::file::reader::RowGroupReader::get_column_reader
/// [Dremel]: https://research.google/pubs/pub36632/
pub struct RowIter<'a> {
descr: SchemaDescPtr,
tree_builder: TreeBuilder,
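
To make the caveat concrete, here is a sketch (not part of the diff) of the column-oriented alternative it recommends: decode only the needed leaf columns through the arrow reader rather than materializing whole rows. The file path and column index are assumptions:

```rust
use std::fs::File;

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;

fn read_one_column() -> Result<(), Box<dyn std::error::Error>> {
    let builder = ParquetRecordBatchReaderBuilder::try_new(File::open("data.parquet")?)?;

    // Keep only the first leaf column; other columns are never decoded.
    let mask = ProjectionMask::leaves(builder.parquet_schema(), [0]);
    let reader = builder.with_projection(mask).build()?;

    for batch in reader {
        println!("decoded {} rows", batch?.num_rows());
    }
    Ok(())
}
```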
