From 7d9563181f89803356da51f1855ced0ef3d5cf6c Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Fri, 17 Nov 2023 15:03:47 -0800 Subject: [PATCH 1/6] Expand parquet crate overview doc --- parquet/CONTRIBUTING.md | 4 +- parquet/src/lib.rs | 82 ++++++++++++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 20 deletions(-) diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md index 5670eef08101..22093a654bb5 100644 --- a/parquet/CONTRIBUTING.md +++ b/parquet/CONTRIBUTING.md @@ -57,8 +57,8 @@ Run `cargo bench` for benchmarks. ## Docs -To build documentation, run `cargo doc --no-deps`. -To compile and view in the browser, run `cargo doc --no-deps --open`. +To build documentation, run `cargo doc --no-deps --features object_store`. +To compile and view in the browser, run `cargo doc --no-deps --features object_store --open`. ## Update Parquet Format diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 0279bbc382ea..4e53f86171a4 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -15,24 +15,70 @@ // specific language governing permissions and limitations // under the License. -//! This crate contains the official Native Rust implementation of -//! [Apache Parquet](https://parquet.apache.org/), part of -//! the [Apache Arrow](https://arrow.apache.org/) project. -//! -//! Please see the [parquet crates.io](https://crates.io/crates/parquet) -//! page for feature flags and tips to improve performance. -//! -//! # Getting Started -//! Start with some examples: -//! -//! 1. [mod@file] for reading and writing parquet files using the -//! [ColumnReader](column::reader::ColumnReader) API. -//! -//! 2. [arrow] for reading and writing parquet files to Arrow -//! `RecordBatch`es -//! -//! 3. [arrow::async_reader] and [arrow::async_writer] for `async` reading -//! and writing parquet files to Arrow `RecordBatch`es (requires the `async` feature). +/*! +This crate contains the official Native Rust implementation of +[Apache Parquet](https://parquet.apache.org/), part of +the [Apache Arrow](https://arrow.apache.org/) project. +The crate provides a number of APIs to read and write Parquet files, +covering a range of use cases. + +Please see the [parquet crates.io](https://crates.io/crates/parquet) +page for feature flags and tips to improve performance. + +# Getting Started + +## Format Overview + +Parquet is a columnar format, which means that unlike row formats like +the [CSV format](https://en.wikipedia.org/wiki/Comma-separated_values) for instance, +values are iterated along columns instead of rows. Parquet is similar in spirit to +[Arrow](https://arrow.apache.org/), with Parquet focusing on storage +efficiency while Arrow prioritizes compute efficiency. + +Parquet files are partitioned for scalability. Each file contains metadata, +along with zero or more "row groups", each row group containing one or +more columns. The APIs in this crate reflect this structure. + +Parquet distinguishes between "logical" and "physical" data types. +For instance, strings (logical type) are stored as byte arrays (physical type). +Likewise, temporal types like dates, times, timestamps, etc. (logical type) +are stored as integers (physical type). This crate exposes both kinds of types. + +For more details about the Parquet format, see the +[Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format). + +## APIs + +This crate exposes both low-level and high-level APIs, organized as follows: + +1. 
The [`arrow`] module reads and writes Parquet data to/from Arrow +`RecordBatch`es. This is the recommended high-level API. It allows leveraging +the wide range of data transforms provided by the +[arrow](https://docs.rs/arrow/latest/arrow/index.html) crate and by the ecosystem +of libraries and services using Arrow as a interop format. + +2. The [`mod@file`] module allows reading and writing Parquet files without taking a +dependency on Arrow. This is the recommended low-level API. Parquet files are +read and written one row group at a time by calling respectively +[`SerializedFileReader::get_row_group`](file::serialized_reader::SerializedFileReader) + and [`SerializedFileWriter::next_row_group`](file::writer::SerializedFileWriter). +Within each row group, columns are read and written one at a time using +respectively [`ColumnReader`](column::reader::ColumnReader) and +[`ColumnWriter`](column::writer::ColumnWriter). The [`mod@file`] module also allows +reading files in a row-wise manner via +[`SerializedFileReader::get_row_iter`](file::serialized_reader::SerializedFileReader). +This is a convenience API which favors simplicity over performance and completeness. +It is not recommended for production use. + +3. Within the [`arrow`] module, async reading and writing is provided by +[`arrow::async_reader`] and [`arrow::async_writer`]. These APIs are more advanced and +require the `async` feature. Within this module, +[`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader) +enables connecting directly to the Cloud Provider storage services of AWS, Azure, GCP +and the likes via the [object_store](https://docs.rs/object_store/latest/object_store/) +crate, enabling network-bandwidth optimizations via predicate and projection push-downs. + +*/ /// Defines a an item with an experimental public API /// From d37c059d890c01d805a0793dfc09b55875ad8445 Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Fri, 17 Nov 2023 15:27:37 -0800 Subject: [PATCH 2/6] Run `cargo fmt --all` --- parquet/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 4e53f86171a4..a30bd177eabb 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -31,7 +31,7 @@ page for feature flags and tips to improve performance. Parquet is a columnar format, which means that unlike row formats like the [CSV format](https://en.wikipedia.org/wiki/Comma-separated_values) for instance, -values are iterated along columns instead of rows. Parquet is similar in spirit to +values are iterated along columns instead of rows. Parquet is similar in spirit to [Arrow](https://arrow.apache.org/), with Parquet focusing on storage efficiency while Arrow prioritizes compute efficiency. @@ -63,7 +63,7 @@ read and written one row group at a time by calling respectively [`SerializedFileReader::get_row_group`](file::serialized_reader::SerializedFileReader) and [`SerializedFileWriter::next_row_group`](file::writer::SerializedFileWriter). Within each row group, columns are read and written one at a time using -respectively [`ColumnReader`](column::reader::ColumnReader) and +respectively [`ColumnReader`](column::reader::ColumnReader) and [`ColumnWriter`](column::writer::ColumnWriter). The [`mod@file`] module also allows reading files in a row-wise manner via [`SerializedFileReader::get_row_iter`](file::serialized_reader::SerializedFileReader). 
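For reference, the high-level API described in item 1 of the overview can be exercised with a short round trip. This is a minimal sketch, not part of the patched sources, assuming only the default `arrow` feature, a placeholder `data.parquet` path, and an illustrative single `Int32` column:

```rust
use std::fs::File;
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ArrowWriter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write a single RecordBatch with one Int32 column to a Parquet file
    let ids = Int32Array::from(vec![1, 2, 3]);
    let batch = RecordBatch::try_from_iter([("id", Arc::new(ids) as ArrayRef)])?;
    let file = File::create("data.parquet")?; // placeholder path
    let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read the file back, one RecordBatch at a time
    let file = File::open("data.parquet")?;
    let reader = ParquetRecordBatchReaderBuilder::try_new(file)?.build()?;
    for batch in reader {
        println!("Read batch with {} rows", batch?.num_rows());
    }
    Ok(())
}
```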
From e8094fefc6c8bafe91cb63be7f7db220523a4348 Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Fri, 17 Nov 2023 16:56:24 -0800 Subject: [PATCH 3/6] Nit: ask to format the code before sending a PR --- parquet/CONTRIBUTING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md index 22093a654bb5..8e1a7a188fa4 100644 --- a/parquet/CONTRIBUTING.md +++ b/parquet/CONTRIBUTING.md @@ -60,6 +60,8 @@ Run `cargo bench` for benchmarks. To build documentation, run `cargo doc --no-deps --features object_store`. To compile and view in the browser, run `cargo doc --no-deps --features object_store --open`. +Before submitting a pull request, run `cargo fmt --all` to format the change. + ## Update Parquet Format To generate the parquet format (thrift definitions) code run [`./regen.sh`](./regen.sh). From 7f698a5e852111da30a59ba29c1d2a2171d8da46 Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Sat, 18 Nov 2023 08:43:13 -0800 Subject: [PATCH 4/6] Add example reading Parquet files from cloud provider --- parquet/Cargo.toml | 6 +++ parquet/examples/object_store_read_parquet.rs | 51 +++++++++++++++++++ parquet/src/arrow/async_reader/store.rs | 37 +++++++++++++- 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 parquet/examples/object_store_read_parquet.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index bdcbcb81cfce..c75d237a70b0 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -81,6 +81,7 @@ serde_json = { version = "1.0", features = ["std"], default-features = false } arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +object_store = { version = "0.8", default-features = false, features = ["azure"] } [package.metadata.docs.rs] all-features = true @@ -114,6 +115,11 @@ name = "async_read_parquet" required-features = ["arrow", "async"] path = "./examples/async_read_parquet.rs" +[[example]] +name = "object_store_read_parquet" +required-features = ["arrow", "object_store"] +path = "./examples/object_store_read_parquet.rs" + [[example]] name = "read_with_rowgroup" required-features = ["arrow", "async"] diff --git a/parquet/examples/object_store_read_parquet.rs b/parquet/examples/object_store_read_parquet.rs new file mode 100644 index 000000000000..acebeb307113 --- /dev/null +++ b/parquet/examples/object_store_read_parquet.rs @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use object_store::azure::MicrosoftAzureBuilder; +use object_store::path::Path; +use object_store::ObjectStore; +use parquet::arrow::async_reader::ParquetObjectReader; +use parquet::arrow::ParquetRecordBatchStreamBuilder; +use parquet::schema::printer::print_parquet_metadata; +use std::error::Error; +use std::io::stdout; +use std::sync::Arc; + +#[tokio::main(flavor = "current_thread")] +async fn main() -> Result<(), Box> { + // Open Azure Storage Blob https://myaccount.blob.core.windows.net/mycontainer/path/to/blob.parquet + // Requires running Azure CLI `az login` beforehand to setup authentication + let storage_container = Arc::new( + MicrosoftAzureBuilder::new() + .with_account("myaccount") + .with_container_name("mycontainer") + .with_use_azure_cli(true) + .build()?, + ); + let blob = storage_container + .get(&Path::from("path/to/blob.parquet")) + .await? + .meta; + println!("Found Blob with {}B at {}", blob.size, blob.location); + + // Show Parquet metadata + let reader = ParquetObjectReader::new(storage_container, blob); + let builder = ParquetRecordBatchStreamBuilder::new(reader).await?; + print_parquet_metadata(&mut stdout(), builder.metadata()); + + Ok(()) +} diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 3e27a96124b0..bf757ad85be4 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -28,7 +28,42 @@ use crate::arrow::async_reader::{AsyncFileReader, MetadataLoader}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ParquetMetaData; -/// Implements [`AsyncFileReader`] for a parquet file in object storage +/** +Reads Parquet files in object storage using [`ObjectStore`]. + +Example reading a Parquet file from Azure Blob Storage + +```rust +use object_store::azure::MicrosoftAzureBuilder; +use object_store::path::Path; +use object_store::ObjectStore; +# use parquet::arrow::async_reader::ParquetObjectReader; +# use parquet::arrow::ParquetRecordBatchStreamBuilder; +# use parquet::schema::printer::print_parquet_metadata; +# use std::error::Error; +# use std::io::stdout; +# use std::sync::Arc; + +// Open Azure Storage Blob https://myaccount.blob.core.windows.net/mycontainer/path/to/blob.parquet +// Requires running Azure CLI `az login` beforehand to setup authentication +let storage_container = Arc::new( + MicrosoftAzureBuilder::new() + .with_account("myaccount") + .with_container_name("mycontainer") + .with_use_azure_cli(true) + .build()?); +let blob = storage_container + .get(&Path::from("path/to/blob.parquet")) + .await? 
+ .meta; +println!("Found Blob with {}B at {}", blob.size, blob.location); + +// Show Parquet metadata +let reader = ParquetObjectReader::new(storage_container, blob); +let builder = ParquetRecordBatchStreamBuilder::new(reader).await?; +print_parquet_metadata(&mut stdout(), builder.metadata()); +``` +*/ #[derive(Clone, Debug)] pub struct ParquetObjectReader { store: Arc, From b98213b78c4acf85c3f2b2debdb00d818fa485cd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Sat, 18 Nov 2023 19:50:29 +0000 Subject: [PATCH 5/6] Tweak copy --- parquet/CONTRIBUTING.md | 4 +- parquet/Cargo.toml | 5 - parquet/README.md | 8 -- parquet/examples/object_store_read_parquet.rs | 51 ------- parquet/src/arrow/async_reader/store.rs | 60 ++++----- parquet/src/arrow/mod.rs | 2 +- parquet/src/file/mod.rs | 2 +- parquet/src/file/reader.rs | 4 +- parquet/src/lib.rs | 125 +++++++++--------- parquet/src/record/reader.rs | 17 ++- 10 files changed, 105 insertions(+), 173 deletions(-) delete mode 100644 parquet/examples/object_store_read_parquet.rs diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md index 8e1a7a188fa4..922332b15d64 100644 --- a/parquet/CONTRIBUTING.md +++ b/parquet/CONTRIBUTING.md @@ -57,8 +57,8 @@ Run `cargo bench` for benchmarks. ## Docs -To build documentation, run `cargo doc --no-deps --features object_store`. -To compile and view in the browser, run `cargo doc --no-deps --features object_store --open`. +To build documentation, run `cargo doc --no-deps --all-features`. +To compile and view in the browser, run `cargo doc --no-deps --all-features --open`. Before submitting a pull request, run `cargo fmt --all` to format the change. diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index c75d237a70b0..4cd03c051e62 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -115,11 +115,6 @@ name = "async_read_parquet" required-features = ["arrow", "async"] path = "./examples/async_read_parquet.rs" -[[example]] -name = "object_store_read_parquet" -required-features = ["arrow", "object_store"] -path = "./examples/object_store_read_parquet.rs" - [[example]] name = "read_with_rowgroup" required-features = ["arrow", "async"] diff --git a/parquet/README.md b/parquet/README.md index 2e0ab1d52c30..9de7aec4e59a 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -71,14 +71,6 @@ The `parquet` crate provides the following features which may be enabled in your - [x] Predicate pushdown - [x] Parquet format 4.0.0 support -## Support for `wasm32-unknown-unknown` target - -It's possible to build `parquet` for the `wasm32-unknown-unknown` target, however not all the compression features are currently unsupported due to issues with the upstream crates. In particular, the `zstd` and `lz4` features may have compilation issues. See issue [#180](https://github.com/apache/arrow-rs/issues/180). - -``` -cargo build -p parquet --target wasm32-unknown-unknown --no-default-features --features cli,snap,flate2,brotli -``` - ## License Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. diff --git a/parquet/examples/object_store_read_parquet.rs b/parquet/examples/object_store_read_parquet.rs deleted file mode 100644 index acebeb307113..000000000000 --- a/parquet/examples/object_store_read_parquet.rs +++ /dev/null @@ -1,51 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use object_store::azure::MicrosoftAzureBuilder; -use object_store::path::Path; -use object_store::ObjectStore; -use parquet::arrow::async_reader::ParquetObjectReader; -use parquet::arrow::ParquetRecordBatchStreamBuilder; -use parquet::schema::printer::print_parquet_metadata; -use std::error::Error; -use std::io::stdout; -use std::sync::Arc; - -#[tokio::main(flavor = "current_thread")] -async fn main() -> Result<(), Box> { - // Open Azure Storage Blob https://myaccount.blob.core.windows.net/mycontainer/path/to/blob.parquet - // Requires running Azure CLI `az login` beforehand to setup authentication - let storage_container = Arc::new( - MicrosoftAzureBuilder::new() - .with_account("myaccount") - .with_container_name("mycontainer") - .with_use_azure_cli(true) - .build()?, - ); - let blob = storage_container - .get(&Path::from("path/to/blob.parquet")) - .await? - .meta; - println!("Found Blob with {}B at {}", blob.size, blob.location); - - // Show Parquet metadata - let reader = ParquetObjectReader::new(storage_container, blob); - let builder = ParquetRecordBatchStreamBuilder::new(reader).await?; - print_parquet_metadata(&mut stdout(), builder.metadata()); - - Ok(()) -} diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index bf757ad85be4..dce394cdb4d8 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -28,42 +28,30 @@ use crate::arrow::async_reader::{AsyncFileReader, MetadataLoader}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ParquetMetaData; -/** -Reads Parquet files in object storage using [`ObjectStore`]. - -Example reading a Parquet file from Azure Blob Storage - -```rust -use object_store::azure::MicrosoftAzureBuilder; -use object_store::path::Path; -use object_store::ObjectStore; -# use parquet::arrow::async_reader::ParquetObjectReader; -# use parquet::arrow::ParquetRecordBatchStreamBuilder; -# use parquet::schema::printer::print_parquet_metadata; -# use std::error::Error; -# use std::io::stdout; -# use std::sync::Arc; - -// Open Azure Storage Blob https://myaccount.blob.core.windows.net/mycontainer/path/to/blob.parquet -// Requires running Azure CLI `az login` beforehand to setup authentication -let storage_container = Arc::new( - MicrosoftAzureBuilder::new() - .with_account("myaccount") - .with_container_name("mycontainer") - .with_use_azure_cli(true) - .build()?); -let blob = storage_container - .get(&Path::from("path/to/blob.parquet")) - .await? - .meta; -println!("Found Blob with {}B at {}", blob.size, blob.location); - -// Show Parquet metadata -let reader = ParquetObjectReader::new(storage_container, blob); -let builder = ParquetRecordBatchStreamBuilder::new(reader).await?; -print_parquet_metadata(&mut stdout(), builder.metadata()); -``` -*/ +/// Reads Parquet files in object storage using [`ObjectStore`]. 
+///
+/// ```no_run
+/// # use std::io::stdout;
+/// # use std::sync::Arc;
+/// # use object_store::azure::MicrosoftAzureBuilder;
+/// # use object_store::ObjectStore;
+/// # use object_store::path::Path;
+/// # use parquet::arrow::async_reader::ParquetObjectReader;
+/// # use parquet::arrow::ParquetRecordBatchStreamBuilder;
+/// # use parquet::schema::printer::print_parquet_metadata;
+/// # async fn main() {
+/// // Populate configuration from environment
+/// let storage_container = Arc::new(MicrosoftAzureBuilder::from_env().build()?);
+/// let location = Path::from("path/to/blob.parquet");
+/// let meta = storage_container.head(&location).await?;
+/// println!("Found Blob with {}B at {}", meta.size, meta.location);
+///
+/// // Show Parquet metadata
+/// let reader = ParquetObjectReader::new(storage_container, meta);
+/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
+/// print_parquet_metadata(&mut stdout(), builder.metadata());
+/// # }
+/// ```
 #[derive(Clone, Debug)]
 pub struct ParquetObjectReader {
     store: Arc<dyn ObjectStore>,
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index 63885643c0fd..950226aef721 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Provides API for reading/writing Arrow
+//! High-level API for reading/writing Arrow
 //! [RecordBatch](arrow_array::RecordBatch)es and
 //! [Array](arrow_array::Array)s to/from Parquet Files.
 //!
diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs
index c20fd38c7f8b..6589d2efaf8b 100644
--- a/parquet/src/file/mod.rs
+++ b/parquet/src/file/mod.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Main entrypoint for working with Parquet API.
+//! Low level APIs for reading raw parquet data.
 //!
 //! Provides access to file and row group readers and writers, record API, metadata, etc.
 //!
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index 921f9df290cc..dd6a0fdd2312 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -134,7 +134,7 @@ pub trait FileReader: Send + Sync {
     /// Get the `i`th row group reader. Note this doesn't do bound check.
     fn get_row_group(&self, i: usize) -> Result<Box<dyn RowGroupReader + '_>>;
 
-    /// Get full iterator of `Row`s from a file (over all row groups).
+    /// Get an iterator over the rows in this file, see [`RowIter`] for caveats.
     ///
     /// Iterator will automatically load the next row group to advance.
     ///
@@ -194,7 +194,7 @@ pub trait RowGroupReader: Send + Sync {
     /// to read bloom filters.
     fn get_column_bloom_filter(&self, i: usize) -> Option<&Sbbf>;
 
-    /// Get iterator of `Row`s from this row group.
+    /// Get an iterator over the rows in this row group, see [`RowIter`] for caveats.
     ///
     /// Projected schema can be a subset of or equal to the file schema, when it is None,
     /// full file schema is assumed.
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index a30bd177eabb..db5d72634389 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -15,70 +15,67 @@
 // specific language governing permissions and limitations
 // under the License.
 
-/*!
-This crate contains the official Native Rust implementation of
-[Apache Parquet](https://parquet.apache.org/), part of
-the [Apache Arrow](https://arrow.apache.org/) project.
-The crate provides a number of APIs to read and write Parquet files,
-covering a range of use cases.
- -Please see the [parquet crates.io](https://crates.io/crates/parquet) -page for feature flags and tips to improve performance. - -# Getting Started - -## Format Overview - -Parquet is a columnar format, which means that unlike row formats like -the [CSV format](https://en.wikipedia.org/wiki/Comma-separated_values) for instance, -values are iterated along columns instead of rows. Parquet is similar in spirit to -[Arrow](https://arrow.apache.org/), with Parquet focusing on storage -efficiency while Arrow prioritizes compute efficiency. - -Parquet files are partitioned for scalability. Each file contains metadata, -along with zero or more "row groups", each row group containing one or -more columns. The APIs in this crate reflect this structure. - -Parquet distinguishes between "logical" and "physical" data types. -For instance, strings (logical type) are stored as byte arrays (physical type). -Likewise, temporal types like dates, times, timestamps, etc. (logical type) -are stored as integers (physical type). This crate exposes both kinds of types. - -For more details about the Parquet format, see the -[Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format). - -## APIs - -This crate exposes both low-level and high-level APIs, organized as follows: - -1. The [`arrow`] module reads and writes Parquet data to/from Arrow -`RecordBatch`es. This is the recommended high-level API. It allows leveraging -the wide range of data transforms provided by the -[arrow](https://docs.rs/arrow/latest/arrow/index.html) crate and by the ecosystem -of libraries and services using Arrow as a interop format. - -2. The [`mod@file`] module allows reading and writing Parquet files without taking a -dependency on Arrow. This is the recommended low-level API. Parquet files are -read and written one row group at a time by calling respectively -[`SerializedFileReader::get_row_group`](file::serialized_reader::SerializedFileReader) - and [`SerializedFileWriter::next_row_group`](file::writer::SerializedFileWriter). -Within each row group, columns are read and written one at a time using -respectively [`ColumnReader`](column::reader::ColumnReader) and -[`ColumnWriter`](column::writer::ColumnWriter). The [`mod@file`] module also allows -reading files in a row-wise manner via -[`SerializedFileReader::get_row_iter`](file::serialized_reader::SerializedFileReader). -This is a convenience API which favors simplicity over performance and completeness. -It is not recommended for production use. - -3. Within the [`arrow`] module, async reading and writing is provided by -[`arrow::async_reader`] and [`arrow::async_writer`]. These APIs are more advanced and -require the `async` feature. Within this module, -[`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader) -enables connecting directly to the Cloud Provider storage services of AWS, Azure, GCP -and the likes via the [object_store](https://docs.rs/object_store/latest/object_store/) -crate, enabling network-bandwidth optimizations via predicate and projection push-downs. - -*/ +//! +//! This crate contains the official Native Rust implementation of +//! [Apache Parquet](https://parquet.apache.org/), part of +//! the [Apache Arrow](https://arrow.apache.org/) project. +//! The crate provides a number of APIs to read and write Parquet files, +//! covering a range of use cases. +//! +//! Please see the [parquet crates.io](https://crates.io/crates/parquet) +//! page for feature flags and tips to improve performance. +//! +//! 
# Format Overview
+//!
+//! Parquet is a columnar format, which means that unlike row formats like [CSV], values are
+//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], with Parquet
+//! focusing on storage efficiency whereas Arrow prioritizes compute efficiency.
+//!
+//! Parquet files are partitioned for scalability. Each file contains metadata,
+//! along with zero or more "row groups", each row group containing one or
+//! more columns. The APIs in this crate reflect this structure.
+//!
+//! Parquet distinguishes between "logical" and "physical" data types.
+//! For instance, strings (logical type) are stored as byte arrays (physical type).
+//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
+//! are stored as integers (physical type). This crate exposes both kinds of types.
+//!
+//! For more details about the Parquet format, see the
+//! [Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
+//!
+//! # APIs
+//!
+//! This crate exposes a number of APIs for different use-cases.
+//!
+//! ## Read/Write Arrow
+//!
+//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`.
+//! This makes for a simple and performant interface to parquet data, whilst allowing workloads
+//! to leverage the wide range of data transforms provided by the [arrow] crate, and by the
+//! ecosystem of libraries and services using [Arrow] as an interop format.
+//!
+//! ## Read/Write Arrow Async
+//!
+//! When the `async` feature is enabled, [`arrow::async_reader`] and [`arrow::async_writer`]
+//! provide the ability to read and write [`arrow`] data asynchronously. Additionally, when the
+//! `object_store` feature is enabled, [`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader)
+//! provides efficient integration with object storage services such as S3 via the [object_store]
+//! crate, automatically optimizing IO based on any predicates or projections provided.
+//!
+//! ## Read/Write Parquet
+//!
+//! Workloads needing finer-grained control, or looking to not take a dependency on arrow,
+//! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet
+//! data model, and therefore require knowledge of the underlying parquet format,
+//! including the details of [Dremel] record shredding and [Logical Types]. Most workloads
+//! should prefer the arrow interfaces.
+//!
+//! [arrow]: https://docs.rs/arrow/latest/arrow/index.html
+//! [Arrow]: https://arrow.apache.org/
+//! [CSV]: https://en.wikipedia.org/wiki/Comma-separated_values
+//! [Dremel]: https://research.google/pubs/pub36632/
+//! [Logical Types]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+//! [object_store]: https://docs.rs/object_store/latest/object_store/
 
 /// Defines a an item with an experimental public API
 ///
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index f98939725517..feaa8055e2dd 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -609,9 +609,20 @@ impl<'a> Either<'a> {
     }
 }
 
-/// Iterator of [`Row`]s.
-/// It is used either for a single row group to iterate over data in that row group, or
-/// an entire file with auto buffering of all row groups.
+/// Access parquet data as an iterator of [`Row`]
+///
+/// # Caveats
+///
+/// Parquet stores data in a columnar fashion using [Dremel] encoding, and is therefore highly
+/// optimised for reading data by column, not row. As a consequence, applications concerned with
+/// performance should prefer the columnar arrow or [ColumnReader] APIs.
+///
+/// Additionally, the current implementation does not correctly handle repeated fields ([#2394]),
+/// and workloads looking to handle such schemas should use the other APIs.
+///
+/// [#2394]: https://github.com/apache/arrow-rs/issues/2394
+/// [ColumnReader]: crate::file::reader::RowGroupReader::get_column_reader
+/// [Dremel]: https://research.google/pubs/pub36632/
 pub struct RowIter<'a> {
     descr: SchemaDescPtr,
     tree_builder: TreeBuilder,

From b7c95abe0c95d24a92934fb77e12c3517be392d1 Mon Sep 17 00:00:00 2001
From: Raphael Taylor-Davies
Date: Sat, 18 Nov 2023 19:57:50 +0000
Subject: [PATCH 6/6] Fix doctest

---
 parquet/src/arrow/async_reader/store.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs
index dce394cdb4d8..293b91aea3ba 100644
--- a/parquet/src/arrow/async_reader/store.rs
+++ b/parquet/src/arrow/async_reader/store.rs
@@ -39,16 +39,16 @@ use crate::file::metadata::ParquetMetaData;
 /// # use parquet::arrow::async_reader::ParquetObjectReader;
 /// # use parquet::arrow::ParquetRecordBatchStreamBuilder;
 /// # use parquet::schema::printer::print_parquet_metadata;
-/// # async fn main() {
+/// # async fn run() {
 /// // Populate configuration from environment
-/// let storage_container = Arc::new(MicrosoftAzureBuilder::from_env().build()?);
+/// let storage_container = Arc::new(MicrosoftAzureBuilder::from_env().build().unwrap());
 /// let location = Path::from("path/to/blob.parquet");
-/// let meta = storage_container.head(&location).await?;
+/// let meta = storage_container.head(&location).await.unwrap();
 /// println!("Found Blob with {}B at {}", meta.size, meta.location);
 ///
 /// // Show Parquet metadata
 /// let reader = ParquetObjectReader::new(storage_container, meta);
-/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
+/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
 /// print_parquet_metadata(&mut stdout(), builder.metadata());
 /// # }
 /// ```
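For reference, the async read path described in the reworked crate docs can be exercised locally as well. This is a minimal sketch, not part of the patched sources, assuming the `async` feature together with the `tokio` and `futures` crates, a placeholder `data.parquet` path, and that selecting the first leaf column is a meaningful projection for that file:

```rust
use futures::TryStreamExt;
use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
use tokio::fs::File;

#[tokio::main(flavor = "current_thread")]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Open the file asynchronously; the builder first loads the footer metadata
    let file = File::open("data.parquet").await?; // placeholder path
    let builder = ParquetRecordBatchStreamBuilder::new(file).await?;

    // Only decode the first leaf column; the projection is applied while reading
    let mask = ProjectionMask::leaves(builder.parquet_schema(), [0]);
    let stream = builder
        .with_projection(mask)
        .with_batch_size(1024)
        .build()?;

    // Drive the stream to completion, collecting the decoded RecordBatches
    let batches: Vec<_> = stream.try_collect().await?;
    println!("Read {} batches", batches.len());
    Ok(())
}
```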