From 747dcbf0670aeab2ede474edb3c4f22028d6a7e6 Mon Sep 17 00:00:00 2001
From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
Date: Tue, 7 Nov 2023 21:16:15 +0000
Subject: [PATCH] Update parquet encoding docs (#5053)

* Update parquet encoding docs

* Review feedback
---
 parquet/README.md    |  2 +-
 parquet/src/basic.rs | 29 ++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/parquet/README.md b/parquet/README.md
index 86c7ee2c35d0..2e0ab1d52c30 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -55,7 +55,7 @@ The `parquet` crate provides the following features which may be enabled in your
 
 ## Parquet Feature Status
 
-- [x] All encodings supported
+- [x] All encodings supported, except for BYTE_STREAM_SPLIT ([#4102](https://github.com/apache/arrow-rs/issues/4102))
 - [x] All compression codecs supported
 - [x] Read support
 - [x] Primitive column value readers
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index ab71aa44169b..3c8602b8022b 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -215,8 +215,21 @@ pub enum Repetition {
 // Mirrors `parquet::Encoding`
 
 /// Encodings supported by Parquet.
+///
 /// Not all encodings are valid for all types. These enums are also used to specify the
 /// encoding of definition and repetition levels.
+///
+/// By default this crate uses [Encoding::PLAIN], [Encoding::RLE], and [Encoding::RLE_DICTIONARY].
+/// These provide very good encode and decode performance, whilst yielding reasonable storage
+/// efficiency and being supported by all major parquet readers.
+///
+/// The delta encodings are also supported and will be used if a newer [WriterVersion] is
+/// configured; however, it should be noted that these sacrifice encode and decode performance for
+/// improved storage efficiency. This performance regression is particularly pronounced in the case
+/// of record skipping as occurs during predicate push-down. It is recommended that users assess
+/// the performance impact when evaluating these encodings.
+///
+/// [WriterVersion]: crate::file::properties::WriterVersion
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)]
 #[allow(non_camel_case_types)]
 pub enum Encoding {
@@ -303,7 +316,21 @@ impl FromStr for Encoding {
 // ----------------------------------------------------------------------
 // Mirrors `parquet::CompressionCodec`
 
-/// Supported compression algorithms.
+/// Supported block compression algorithms.
+///
+/// Block compression can yield non-trivial improvements to storage efficiency at the expense
+/// of potentially significantly worse encode and decode performance. Many applications,
+/// especially those making use of high-throughput and low-cost commodity object storage,
+/// may find storage efficiency less important than decode throughput, and therefore may
+/// wish to not make use of block compression.
+///
+/// The writers in this crate default to no block compression for this reason.
+///
+/// Applications that do still wish to use block compression will find [`Compression::ZSTD`]
+/// to provide a good balance of compression, performance, and ecosystem support. Alternatively,
+/// [`Compression::LZ4_RAW`] provides much faster decompression speeds, at the cost of typically
+/// worse compression ratios. However, it is not as widely supported by the ecosystem, with the
+/// Hadoop ecosystem historically favoring the non-standard and now deprecated [`Compression::LZ4`].
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 #[allow(non_camel_case_types)]
 pub enum Compression {
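
For reference, below is a minimal sketch (not part of the patch itself) of how the behaviour described in these docs is selected through `WriterProperties`: the defaults use PLAIN/RLE/RLE_DICTIONARY with no block compression, while a newer writer version opts in to the delta encodings and `set_compression` enables a block codec. It assumes the crate's `zstd` feature is enabled, and the ZSTD level of 3 is an arbitrary example value.

```rust
use parquet::basic::{Compression, ZstdLevel};
use parquet::file::properties::{WriterProperties, WriterVersion};

fn main() {
    // Defaults: PLAIN / RLE / RLE_DICTIONARY encodings and no block compression.
    let _defaults = WriterProperties::builder().build();

    // Opt in to the delta encodings (newer writer version) and ZSTD block
    // compression, trading encode/decode speed for storage efficiency.
    // The level of 3 is an arbitrary example value.
    let _tuned = WriterProperties::builder()
        .set_writer_version(WriterVersion::PARQUET_2_0)
        .set_compression(Compression::ZSTD(
            ZstdLevel::try_new(3).expect("valid zstd level"),
        ))
        .build();
}
```

Either set of properties can then be passed to a writer, for example as the optional properties argument of `ArrowWriter::try_new`, when writing files.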