From 7d9563181f89803356da51f1855ced0ef3d5cf6c Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Fri, 17 Nov 2023 15:03:47 -0800 Subject: [PATCH 1/6] Expand parquet crate overview doc --- parquet/CONTRIBUTING.md | 4 +- parquet/src/lib.rs | 82 ++++++++++++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 20 deletions(-) diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md index 5670eef08101..22093a654bb5 100644 --- a/parquet/CONTRIBUTING.md +++ b/parquet/CONTRIBUTING.md @@ -57,8 +57,8 @@ Run `cargo bench` for benchmarks. ## Docs -To build documentation, run `cargo doc --no-deps`. -To compile and view in the browser, run `cargo doc --no-deps --open`. +To build documentation, run `cargo doc --no-deps --features object_store`. +To compile and view in the browser, run `cargo doc --no-deps --features object_store --open`. ## Update Parquet Format diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 0279bbc382ea..4e53f86171a4 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -15,24 +15,70 @@ // specific language governing permissions and limitations // under the License. -//! This crate contains the official Native Rust implementation of -//! [Apache Parquet](https://parquet.apache.org/), part of -//! the [Apache Arrow](https://arrow.apache.org/) project. -//! -//! Please see the [parquet crates.io](https://crates.io/crates/parquet) -//! page for feature flags and tips to improve performance. -//! -//! # Getting Started -//! Start with some examples: -//! -//! 1. [mod@file] for reading and writing parquet files using the -//! [ColumnReader](column::reader::ColumnReader) API. -//! -//! 2. [arrow] for reading and writing parquet files to Arrow -//! `RecordBatch`es -//! -//! 3. [arrow::async_reader] and [arrow::async_writer] for `async` reading -//! and writing parquet files to Arrow `RecordBatch`es (requires the `async` feature). +/*! +This crate contains the official Native Rust implementation of +[Apache Parquet](https://parquet.apache.org/), part of +the [Apache Arrow](https://arrow.apache.org/) project. +The crate provides a number of APIs to read and write Parquet files, +covering a range of use cases. + +Please see the [parquet crates.io](https://crates.io/crates/parquet) +page for feature flags and tips to improve performance. + +# Getting Started + +## Format Overview + +Parquet is a columnar format, which means that unlike row formats like +the [CSV format](https://en.wikipedia.org/wiki/Comma-separated_values) for instance, +values are iterated along columns instead of rows. Parquet is similar in spirit to +[Arrow](https://arrow.apache.org/), with Parquet focusing on storage +efficiency while Arrow prioritizes compute efficiency. + +Parquet files are partitioned for scalability. Each file contains metadata, +along with zero or more "row groups", each row group containing one or +more columns. The APIs in this crate reflect this structure. + +Parquet distinguishes between "logical" and "physical" data types. +For instance, strings (logical type) are stored as byte arrays (physical type). +Likewise, temporal types like dates, times, timestamps, etc. (logical type) +are stored as integers (physical type). This crate exposes both kinds of types. + +For more details about the Parquet format, see the +[Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format). + +## APIs + +This crate exposes both low-level and high-level APIs, organized as follows: + +1. 
The [`arrow`] module reads and writes Parquet data to/from Arrow +`RecordBatch`es. This is the recommended high-level API. It allows leveraging +the wide range of data transforms provided by the +[arrow](https://docs.rs/arrow/latest/arrow/index.html) crate and by the ecosystem +of libraries and services using Arrow as a interop format. + +2. The [`mod@file`] module allows reading and writing Parquet files without taking a +dependency on Arrow. This is the recommended low-level API. Parquet files are +read and written one row group at a time by calling respectively +[`SerializedFileReader::get_row_group`](file::serialized_reader::SerializedFileReader) + and [`SerializedFileWriter::next_row_group`](file::writer::SerializedFileWriter). +Within each row group, columns are read and written one at a time using +respectively [`ColumnReader`](column::reader::ColumnReader) and +[`ColumnWriter`](column::writer::ColumnWriter). The [`mod@file`] module also allows +reading files in a row-wise manner via +[`SerializedFileReader::get_row_iter`](file::serialized_reader::SerializedFileReader). +This is a convenience API which favors simplicity over performance and completeness. +It is not recommended for production use. + +3. Within the [`arrow`] module, async reading and writing is provided by +[`arrow::async_reader`] and [`arrow::async_writer`]. These APIs are more advanced and +require the `async` feature. Within this module, +[`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader) +enables connecting directly to the Cloud Provider storage services of AWS, Azure, GCP +and the likes via the [object_store](https://docs.rs/object_store/latest/object_store/) +crate, enabling network-bandwidth optimizations via predicate and projection push-downs. + +*/ /// Defines a an item with an experimental public API /// From d37c059d890c01d805a0793dfc09b55875ad8445 Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Fri, 17 Nov 2023 15:27:37 -0800 Subject: [PATCH 2/6] Run `cargo fmt --all` --- parquet/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 4e53f86171a4..a30bd177eabb 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -31,7 +31,7 @@ page for feature flags and tips to improve performance. Parquet is a columnar format, which means that unlike row formats like the [CSV format](https://en.wikipedia.org/wiki/Comma-separated_values) for instance, -values are iterated along columns instead of rows. Parquet is similar in spirit to +values are iterated along columns instead of rows. Parquet is similar in spirit to [Arrow](https://arrow.apache.org/), with Parquet focusing on storage efficiency while Arrow prioritizes compute efficiency. @@ -63,7 +63,7 @@ read and written one row group at a time by calling respectively [`SerializedFileReader::get_row_group`](file::serialized_reader::SerializedFileReader) and [`SerializedFileWriter::next_row_group`](file::writer::SerializedFileWriter). Within each row group, columns are read and written one at a time using -respectively [`ColumnReader`](column::reader::ColumnReader) and +respectively [`ColumnReader`](column::reader::ColumnReader) and [`ColumnWriter`](column::writer::ColumnWriter). The [`mod@file`] module also allows reading files in a row-wise manner via [`SerializedFileReader::get_row_iter`](file::serialized_reader::SerializedFileReader). 
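For reference, the high-level API described in item 1 of the overview can be exercised with a short round trip. This is a minimal sketch, not part of the patched sources, assuming only the default `arrow` feature, a placeholder `data.parquet` path, and an illustrative single `Int32` column:

```rust
use std::fs::File;
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ArrowWriter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write a single RecordBatch with one Int32 column to a Parquet file
    let ids = Int32Array::from(vec![1, 2, 3]);
    let batch = RecordBatch::try_from_iter([("id", Arc::new(ids) as ArrayRef)])?;
    let file = File::create("data.parquet")?; // placeholder path
    let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read the file back, one RecordBatch at a time
    let file = File::open("data.parquet")?;
    let reader = ParquetRecordBatchReaderBuilder::try_new(file)?.build()?;
    for batch in reader {
        println!("Read batch with {} rows", batch?.num_rows());
    }
    Ok(())
}
```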
From e8094fefc6c8bafe91cb63be7f7db220523a4348 Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Fri, 17 Nov 2023 16:56:24 -0800 Subject: [PATCH 3/6] Nit: ask to format the code before sending a PR --- parquet/CONTRIBUTING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md index 22093a654bb5..8e1a7a188fa4 100644 --- a/parquet/CONTRIBUTING.md +++ b/parquet/CONTRIBUTING.md @@ -60,6 +60,8 @@ Run `cargo bench` for benchmarks. To build documentation, run `cargo doc --no-deps --features object_store`. To compile and view in the browser, run `cargo doc --no-deps --features object_store --open`. +Before submitting a pull request, run `cargo fmt --all` to format the change. + ## Update Parquet Format To generate the parquet format (thrift definitions) code run [`./regen.sh`](./regen.sh). From 7f698a5e852111da30a59ba29c1d2a2171d8da46 Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Sat, 18 Nov 2023 08:43:13 -0800 Subject: [PATCH 4/6] Add example reading Parquet files from cloud provider --- parquet/Cargo.toml | 6 +++ parquet/examples/object_store_read_parquet.rs | 51 +++++++++++++++++++ parquet/src/arrow/async_reader/store.rs | 37 +++++++++++++- 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 parquet/examples/object_store_read_parquet.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index bdcbcb81cfce..c75d237a70b0 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -81,6 +81,7 @@ serde_json = { version = "1.0", features = ["std"], default-features = false } arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +object_store = { version = "0.8", default-features = false, features = ["azure"] } [package.metadata.docs.rs] all-features = true @@ -114,6 +115,11 @@ name = "async_read_parquet" required-features = ["arrow", "async"] path = "./examples/async_read_parquet.rs" +[[example]] +name = "object_store_read_parquet" +required-features = ["arrow", "object_store"] +path = "./examples/object_store_read_parquet.rs" + [[example]] name = "read_with_rowgroup" required-features = ["arrow", "async"] diff --git a/parquet/examples/object_store_read_parquet.rs b/parquet/examples/object_store_read_parquet.rs new file mode 100644 index 000000000000..acebeb307113 --- /dev/null +++ b/parquet/examples/object_store_read_parquet.rs @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use object_store::azure::MicrosoftAzureBuilder; +use object_store::path::Path; +use object_store::ObjectStore; +use parquet::arrow::async_reader::ParquetObjectReader; +use parquet::arrow::ParquetRecordBatchStreamBuilder; +use parquet::schema::printer::print_parquet_metadata; +use std::error::Error; +use std::io::stdout; +use std::sync::Arc; + +#[tokio::main(flavor = "current_thread")] +async fn main() -> Result<(), Box> { + // Open Azure Storage Blob https://myaccount.blob.core.windows.net/mycontainer/path/to/blob.parquet + // Requires running Azure CLI `az login` beforehand to setup authentication + let storage_container = Arc::new( + MicrosoftAzureBuilder::new() + .with_account("myaccount") + .with_container_name("mycontainer") + .with_use_azure_cli(true) + .build()?, + ); + let blob = storage_container + .get(&Path::from("path/to/blob.parquet")) + .await? + .meta; + println!("Found Blob with {}B at {}", blob.size, blob.location); + + // Show Parquet metadata + let reader = ParquetObjectReader::new(storage_container, blob); + let builder = ParquetRecordBatchStreamBuilder::new(reader).await?; + print_parquet_metadata(&mut stdout(), builder.metadata()); + + Ok(()) +} diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 3e27a96124b0..bf757ad85be4 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -28,7 +28,42 @@ use crate::arrow::async_reader::{AsyncFileReader, MetadataLoader}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ParquetMetaData; -/// Implements [`AsyncFileReader`] for a parquet file in object storage +/** +Reads Parquet files in object storage using [`ObjectStore`]. + +Example reading a Parquet file from Azure Blob Storage + +```rust +use object_store::azure::MicrosoftAzureBuilder; +use object_store::path::Path; +use object_store::ObjectStore; +# use parquet::arrow::async_reader::ParquetObjectReader; +# use parquet::arrow::ParquetRecordBatchStreamBuilder; +# use parquet::schema::printer::print_parquet_metadata; +# use std::error::Error; +# use std::io::stdout; +# use std::sync::Arc; + +// Open Azure Storage Blob https://myaccount.blob.core.windows.net/mycontainer/path/to/blob.parquet +// Requires running Azure CLI `az login` beforehand to setup authentication +let storage_container = Arc::new( + MicrosoftAzureBuilder::new() + .with_account("myaccount") + .with_container_name("mycontainer") + .with_use_azure_cli(true) + .build()?); +let blob = storage_container + .get(&Path::from("path/to/blob.parquet")) + .await? 
+ .meta; +println!("Found Blob with {}B at {}", blob.size, blob.location); + +// Show Parquet metadata +let reader = ParquetObjectReader::new(storage_container, blob); +let builder = ParquetRecordBatchStreamBuilder::new(reader).await?; +print_parquet_metadata(&mut stdout(), builder.metadata()); +``` +*/ #[derive(Clone, Debug)] pub struct ParquetObjectReader { store: Arc, From b98213b78c4acf85c3f2b2debdb00d818fa485cd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Sat, 18 Nov 2023 19:50:29 +0000 Subject: [PATCH 5/6] Tweak copy --- parquet/CONTRIBUTING.md | 4 +- parquet/Cargo.toml | 5 - parquet/README.md | 8 -- parquet/examples/object_store_read_parquet.rs | 51 ------- parquet/src/arrow/async_reader/store.rs | 60 ++++----- parquet/src/arrow/mod.rs | 2 +- parquet/src/file/mod.rs | 2 +- parquet/src/file/reader.rs | 4 +- parquet/src/lib.rs | 125 +++++++++--------- parquet/src/record/reader.rs | 17 ++- 10 files changed, 105 insertions(+), 173 deletions(-) delete mode 100644 parquet/examples/object_store_read_parquet.rs diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md index 8e1a7a188fa4..922332b15d64 100644 --- a/parquet/CONTRIBUTING.md +++ b/parquet/CONTRIBUTING.md @@ -57,8 +57,8 @@ Run `cargo bench` for benchmarks. ## Docs -To build documentation, run `cargo doc --no-deps --features object_store`. -To compile and view in the browser, run `cargo doc --no-deps --features object_store --open`. +To build documentation, run `cargo doc --no-deps --all-features`. +To compile and view in the browser, run `cargo doc --no-deps --all-features --open`. Before submitting a pull request, run `cargo fmt --all` to format the change. diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index c75d237a70b0..4cd03c051e62 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -115,11 +115,6 @@ name = "async_read_parquet" required-features = ["arrow", "async"] path = "./examples/async_read_parquet.rs" -[[example]] -name = "object_store_read_parquet" -required-features = ["arrow", "object_store"] -path = "./examples/object_store_read_parquet.rs" - [[example]] name = "read_with_rowgroup" required-features = ["arrow", "async"] diff --git a/parquet/README.md b/parquet/README.md index 2e0ab1d52c30..9de7aec4e59a 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -71,14 +71,6 @@ The `parquet` crate provides the following features which may be enabled in your - [x] Predicate pushdown - [x] Parquet format 4.0.0 support -## Support for `wasm32-unknown-unknown` target - -It's possible to build `parquet` for the `wasm32-unknown-unknown` target, however not all the compression features are currently unsupported due to issues with the upstream crates. In particular, the `zstd` and `lz4` features may have compilation issues. See issue [#180](https://github.com/apache/arrow-rs/issues/180). - -``` -cargo build -p parquet --target wasm32-unknown-unknown --no-default-features --features cli,snap,flate2,brotli -``` - ## License Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. diff --git a/parquet/examples/object_store_read_parquet.rs b/parquet/examples/object_store_read_parquet.rs deleted file mode 100644 index acebeb307113..000000000000 --- a/parquet/examples/object_store_read_parquet.rs +++ /dev/null @@ -1,51 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use object_store::azure::MicrosoftAzureBuilder; -use object_store::path::Path; -use object_store::ObjectStore; -use parquet::arrow::async_reader::ParquetObjectReader; -use parquet::arrow::ParquetRecordBatchStreamBuilder; -use parquet::schema::printer::print_parquet_metadata; -use std::error::Error; -use std::io::stdout; -use std::sync::Arc; - -#[tokio::main(flavor = "current_thread")] -async fn main() -> Result<(), Box> { - // Open Azure Storage Blob https://myaccount.blob.core.windows.net/mycontainer/path/to/blob.parquet - // Requires running Azure CLI `az login` beforehand to setup authentication - let storage_container = Arc::new( - MicrosoftAzureBuilder::new() - .with_account("myaccount") - .with_container_name("mycontainer") - .with_use_azure_cli(true) - .build()?, - ); - let blob = storage_container - .get(&Path::from("path/to/blob.parquet")) - .await? - .meta; - println!("Found Blob with {}B at {}", blob.size, blob.location); - - // Show Parquet metadata - let reader = ParquetObjectReader::new(storage_container, blob); - let builder = ParquetRecordBatchStreamBuilder::new(reader).await?; - print_parquet_metadata(&mut stdout(), builder.metadata()); - - Ok(()) -} diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index bf757ad85be4..dce394cdb4d8 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -28,42 +28,30 @@ use crate::arrow::async_reader::{AsyncFileReader, MetadataLoader}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ParquetMetaData; -/** -Reads Parquet files in object storage using [`ObjectStore`]. - -Example reading a Parquet file from Azure Blob Storage - -```rust -use object_store::azure::MicrosoftAzureBuilder; -use object_store::path::Path; -use object_store::ObjectStore; -# use parquet::arrow::async_reader::ParquetObjectReader; -# use parquet::arrow::ParquetRecordBatchStreamBuilder; -# use parquet::schema::printer::print_parquet_metadata; -# use std::error::Error; -# use std::io::stdout; -# use std::sync::Arc; - -// Open Azure Storage Blob https://myaccount.blob.core.windows.net/mycontainer/path/to/blob.parquet -// Requires running Azure CLI `az login` beforehand to setup authentication -let storage_container = Arc::new( - MicrosoftAzureBuilder::new() - .with_account("myaccount") - .with_container_name("mycontainer") - .with_use_azure_cli(true) - .build()?); -let blob = storage_container - .get(&Path::from("path/to/blob.parquet")) - .await? - .meta; -println!("Found Blob with {}B at {}", blob.size, blob.location); - -// Show Parquet metadata -let reader = ParquetObjectReader::new(storage_container, blob); -let builder = ParquetRecordBatchStreamBuilder::new(reader).await?; -print_parquet_metadata(&mut stdout(), builder.metadata()); -``` -*/ +/// Reads Parquet files in object storage using [`ObjectStore`]. 
+///
+/// ```no_run
+/// # use std::io::stdout;
+/// # use std::sync::Arc;
+/// # use object_store::azure::MicrosoftAzureBuilder;
+/// # use object_store::ObjectStore;
+/// # use object_store::path::Path;
+/// # use parquet::arrow::async_reader::ParquetObjectReader;
+/// # use parquet::arrow::ParquetRecordBatchStreamBuilder;
+/// # use parquet::schema::printer::print_parquet_metadata;
+/// # async fn main() {
+/// // Populate configuration from environment
+/// let storage_container = Arc::new(MicrosoftAzureBuilder::from_env().build()?);
+/// let location = Path::from("path/to/blob.parquet");
+/// let meta = storage_container.head(&location).await?;
+/// println!("Found Blob with {}B at {}", meta.size, meta.location);
+///
+/// // Show Parquet metadata
+/// let reader = ParquetObjectReader::new(storage_container, meta);
+/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
+/// print_parquet_metadata(&mut stdout(), builder.metadata());
+/// # }
+/// ```
 #[derive(Clone, Debug)]
 pub struct ParquetObjectReader {
     store: Arc<dyn ObjectStore>,
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index 63885643c0fd..950226aef721 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Provides API for reading/writing Arrow
+//! High-level API for reading/writing Arrow
 //! [RecordBatch](arrow_array::RecordBatch)es and
 //! [Array](arrow_array::Array)s to/from Parquet Files.
 //!
diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs
index c20fd38c7f8b..6589d2efaf8b 100644
--- a/parquet/src/file/mod.rs
+++ b/parquet/src/file/mod.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Main entrypoint for working with Parquet API.
+//! Low level APIs for reading raw parquet data.
 //!
 //! Provides access to file and row group readers and writers, record API, metadata, etc.
 //!
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index 921f9df290cc..dd6a0fdd2312 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -134,7 +134,7 @@ pub trait FileReader: Send + Sync {
     /// Get the `i`th row group reader. Note this doesn't do bound check.
     fn get_row_group(&self, i: usize) -> Result<Box<dyn RowGroupReader + '_>>;
 
-    /// Get full iterator of `Row`s from a file (over all row groups).
+    /// Get an iterator over the rows in this file, see [`RowIter`] for caveats.
     ///
     /// Iterator will automatically load the next row group to advance.
     ///
@@ -194,7 +194,7 @@ pub trait RowGroupReader: Send + Sync {
     /// to read bloom filters.
     fn get_column_bloom_filter(&self, i: usize) -> Option<&Sbbf>;
 
-    /// Get iterator of `Row`s from this row group.
+    /// Get an iterator over the rows in this row group, see [`RowIter`] for caveats.
     ///
     /// Projected schema can be a subset of or equal to the file schema, when it is None,
     /// full file schema is assumed.
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index a30bd177eabb..db5d72634389 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -15,70 +15,67 @@
 // specific language governing permissions and limitations
 // under the License.
 
-/*!
-This crate contains the official Native Rust implementation of
-[Apache Parquet](https://parquet.apache.org/), part of
-the [Apache Arrow](https://arrow.apache.org/) project.
-The crate provides a number of APIs to read and write Parquet files,
-covering a range of use cases.
- -Please see the [parquet crates.io](https://crates.io/crates/parquet) -page for feature flags and tips to improve performance. - -# Getting Started - -## Format Overview - -Parquet is a columnar format, which means that unlike row formats like -the [CSV format](https://en.wikipedia.org/wiki/Comma-separated_values) for instance, -values are iterated along columns instead of rows. Parquet is similar in spirit to -[Arrow](https://arrow.apache.org/), with Parquet focusing on storage -efficiency while Arrow prioritizes compute efficiency. - -Parquet files are partitioned for scalability. Each file contains metadata, -along with zero or more "row groups", each row group containing one or -more columns. The APIs in this crate reflect this structure. - -Parquet distinguishes between "logical" and "physical" data types. -For instance, strings (logical type) are stored as byte arrays (physical type). -Likewise, temporal types like dates, times, timestamps, etc. (logical type) -are stored as integers (physical type). This crate exposes both kinds of types. - -For more details about the Parquet format, see the -[Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format). - -## APIs - -This crate exposes both low-level and high-level APIs, organized as follows: - -1. The [`arrow`] module reads and writes Parquet data to/from Arrow -`RecordBatch`es. This is the recommended high-level API. It allows leveraging -the wide range of data transforms provided by the -[arrow](https://docs.rs/arrow/latest/arrow/index.html) crate and by the ecosystem -of libraries and services using Arrow as a interop format. - -2. The [`mod@file`] module allows reading and writing Parquet files without taking a -dependency on Arrow. This is the recommended low-level API. Parquet files are -read and written one row group at a time by calling respectively -[`SerializedFileReader::get_row_group`](file::serialized_reader::SerializedFileReader) - and [`SerializedFileWriter::next_row_group`](file::writer::SerializedFileWriter). -Within each row group, columns are read and written one at a time using -respectively [`ColumnReader`](column::reader::ColumnReader) and -[`ColumnWriter`](column::writer::ColumnWriter). The [`mod@file`] module also allows -reading files in a row-wise manner via -[`SerializedFileReader::get_row_iter`](file::serialized_reader::SerializedFileReader). -This is a convenience API which favors simplicity over performance and completeness. -It is not recommended for production use. - -3. Within the [`arrow`] module, async reading and writing is provided by -[`arrow::async_reader`] and [`arrow::async_writer`]. These APIs are more advanced and -require the `async` feature. Within this module, -[`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader) -enables connecting directly to the Cloud Provider storage services of AWS, Azure, GCP -and the likes via the [object_store](https://docs.rs/object_store/latest/object_store/) -crate, enabling network-bandwidth optimizations via predicate and projection push-downs. - -*/ +//! +//! This crate contains the official Native Rust implementation of +//! [Apache Parquet](https://parquet.apache.org/), part of +//! the [Apache Arrow](https://arrow.apache.org/) project. +//! The crate provides a number of APIs to read and write Parquet files, +//! covering a range of use cases. +//! +//! Please see the [parquet crates.io](https://crates.io/crates/parquet) +//! page for feature flags and tips to improve performance. +//! +//! 
# Format Overview
+//!
+//! Parquet is a columnar format, which means that unlike row formats like [CSV], values are
+//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], with Parquet
+//! focusing on storage efficiency whereas Arrow prioritizes compute efficiency.
+//!
+//! Parquet files are partitioned for scalability. Each file contains metadata,
+//! along with zero or more "row groups", each row group containing one or
+//! more columns. The APIs in this crate reflect this structure.
+//!
+//! Parquet distinguishes between "logical" and "physical" data types.
+//! For instance, strings (logical type) are stored as byte arrays (physical type).
+//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
+//! are stored as integers (physical type). This crate exposes both kinds of types.
+//!
+//! For more details about the Parquet format, see the
+//! [Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
+//!
+//! # APIs
+//!
+//! This crate exposes a number of APIs for different use-cases.
+//!
+//! ## Read/Write Arrow
+//!
+//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`.
+//! This makes for a simple and performant interface to parquet data, whilst allowing workloads
+//! to leverage the wide range of data transforms provided by the [arrow] crate, and by the
+//! ecosystem of libraries and services using [Arrow] as an interop format.
+//!
+//! ## Read/Write Arrow Async
+//!
+//! When the `async` feature is enabled, [`arrow::async_reader`] and [`arrow::async_writer`]
+//! provide the ability to read and write [`arrow`] data asynchronously. Additionally, when the
+//! `object_store` feature is enabled, [`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader)
+//! provides efficient integration with object storage services such as S3 via the [object_store]
+//! crate, automatically optimizing IO based on any predicates or projections provided.
+//!
+//! ## Read/Write Parquet
+//!
+//! Workloads needing finer-grained control, or looking to not take a dependency on arrow,
+//! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet
+//! data model, and therefore require knowledge of the underlying parquet format,
+//! including the details of [Dremel] record shredding and [Logical Types]. Most workloads
+//! should prefer the arrow interfaces.
+//!
+//! [arrow]: https://docs.rs/arrow/latest/arrow/index.html
+//! [Arrow]: https://arrow.apache.org/
+//! [CSV]: https://en.wikipedia.org/wiki/Comma-separated_values
+//! [Dremel]: https://research.google/pubs/pub36632/
+//! [Logical Types]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+//! [object_store]: https://docs.rs/object_store/latest/object_store/
 
 /// Defines a an item with an experimental public API
 ///
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index f98939725517..feaa8055e2dd 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -609,9 +609,20 @@ impl<'a> Either<'a> {
     }
 }
 
-/// Iterator of [`Row`]s.
-/// It is used either for a single row group to iterate over data in that row group, or
-/// an entire file with auto buffering of all row groups.
+/// Access parquet data as an iterator of [`Row`]
+///
+/// # Caveats
+///
+/// Parquet stores data in a columnar fashion using [Dremel] encoding, and is therefore highly
+/// optimised for reading data by column, not row. As a consequence, applications concerned with
+/// performance should prefer the columnar arrow or [ColumnReader] APIs.
+///
+/// Additionally, the current implementation does not correctly handle repeated fields ([#2394]),
+/// and workloads looking to handle such schemas should use the other APIs.
+///
+/// [#2394]: https://github.com/apache/arrow-rs/issues/2394
+/// [ColumnReader]: crate::file::reader::RowGroupReader::get_column_reader
+/// [Dremel]: https://research.google/pubs/pub36632/
 pub struct RowIter<'a> {
     descr: SchemaDescPtr,
     tree_builder: TreeBuilder,

From b7c95abe0c95d24a92934fb77e12c3517be392d1 Mon Sep 17 00:00:00 2001
From: Raphael Taylor-Davies
Date: Sat, 18 Nov 2023 19:57:50 +0000
Subject: [PATCH 6/6] Fix doctest

---
 parquet/src/arrow/async_reader/store.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs
index dce394cdb4d8..293b91aea3ba 100644
--- a/parquet/src/arrow/async_reader/store.rs
+++ b/parquet/src/arrow/async_reader/store.rs
@@ -39,16 +39,16 @@ use crate::file::metadata::ParquetMetaData;
 /// # use parquet::arrow::async_reader::ParquetObjectReader;
 /// # use parquet::arrow::ParquetRecordBatchStreamBuilder;
 /// # use parquet::schema::printer::print_parquet_metadata;
-/// # async fn main() {
+/// # async fn run() {
 /// // Populate configuration from environment
-/// let storage_container = Arc::new(MicrosoftAzureBuilder::from_env().build()?);
+/// let storage_container = Arc::new(MicrosoftAzureBuilder::from_env().build().unwrap());
 /// let location = Path::from("path/to/blob.parquet");
-/// let meta = storage_container.head(&location).await?;
+/// let meta = storage_container.head(&location).await.unwrap();
 /// println!("Found Blob with {}B at {}", meta.size, meta.location);
 ///
 /// // Show Parquet metadata
 /// let reader = ParquetObjectReader::new(storage_container, meta);
-/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
+/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
 /// print_parquet_metadata(&mut stdout(), builder.metadata());
 /// # }
 /// ```
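For reference, the async read path described in the reworked crate docs can be exercised locally as well. This is a minimal sketch, not part of the patched sources, assuming the `async` feature together with the `tokio` and `futures` crates, a placeholder `data.parquet` path, and that selecting the first leaf column is a meaningful projection for that file:

```rust
use futures::TryStreamExt;
use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
use tokio::fs::File;

#[tokio::main(flavor = "current_thread")]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Open the file asynchronously; the builder first loads the footer metadata
    let file = File::open("data.parquet").await?; // placeholder path
    let builder = ParquetRecordBatchStreamBuilder::new(file).await?;

    // Only decode the first leaf column; the projection is applied while reading
    let mask = ProjectionMask::leaves(builder.parquet_schema(), [0]);
    let stream = builder
        .with_projection(mask)
        .with_batch_size(1024)
        .build()?;

    // Drive the stream to completion, collecting the decoded RecordBatches
    let batches: Vec<_> = stream.try_collect().await?;
    println!("Read {} batches", batches.len());
    Ok(())
}
```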