From 747dcbf0670aeab2ede474edb3c4f22028d6a7e6 Mon Sep 17 00:00:00 2001
From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
Date: Tue, 7 Nov 2023 21:16:15 +0000
Subject: [PATCH] Update parquet encoding docs (#5053)

* Update parquet encoding docs

* Review feedback
---
 parquet/README.md    |  2 +-
 parquet/src/basic.rs | 29 ++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/parquet/README.md b/parquet/README.md
index 86c7ee2c35d0..2e0ab1d52c30 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -55,7 +55,7 @@ The `parquet` crate provides the following features which may be enabled in your
 
 ## Parquet Feature Status
 
-- [x] All encodings supported
+- [x] All encodings supported, except for BYTE_STREAM_SPLIT ([#4102](https://github.com/apache/arrow-rs/issues/4102))
 - [x] All compression codecs supported
 - [x] Read support
 - [x] Primitive column value readers
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index ab71aa44169b..3c8602b8022b 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -215,8 +215,21 @@ pub enum Repetition {
 // Mirrors `parquet::Encoding`
 
 /// Encodings supported by Parquet.
+///
 /// Not all encodings are valid for all types. These enums are also used to specify the
 /// encoding of definition and repetition levels.
+///
+/// By default this crate uses [Encoding::PLAIN], [Encoding::RLE], and [Encoding::RLE_DICTIONARY].
+/// These provide very good encode and decode performance, whilst yielding reasonable storage
+/// efficiency and being supported by all major parquet readers.
+///
+/// The delta encodings are also supported and will be used if a newer [WriterVersion] is
+/// configured; however, it should be noted that these sacrifice encode and decode performance for
+/// improved storage efficiency. This performance regression is particularly pronounced in the case
+/// of record skipping as occurs during predicate push-down. It is recommended that users assess
+/// the performance impact when evaluating these encodings.
+///
+/// [WriterVersion]: crate::file::properties::WriterVersion
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)]
 #[allow(non_camel_case_types)]
 pub enum Encoding {
@@ -303,7 +316,21 @@ impl FromStr for Encoding {
 // ----------------------------------------------------------------------
 // Mirrors `parquet::CompressionCodec`
 
-/// Supported compression algorithms.
+/// Supported block compression algorithms.
+///
+/// Block compression can yield non-trivial improvements to storage efficiency at the expense
+/// of potentially significantly worse encode and decode performance. Many applications,
+/// especially those making use of high-throughput and low-cost commodity object storage,
+/// may find storage efficiency less important than decode throughput, and therefore may
+/// wish to not make use of block compression.
+///
+/// The writers in this crate default to no block compression for this reason.
+///
+/// Applications that do still wish to use block compression will find [`Compression::ZSTD`]
+/// to provide a good balance of compression, performance, and ecosystem support. Alternatively,
+/// [`Compression::LZ4_RAW`] provides much faster decompression speeds, at the cost of typically
+/// worse compression ratios. However, it is not as widely supported by the ecosystem, with the
+/// Hadoop ecosystem historically favoring the non-standard and now deprecated [`Compression::LZ4`].
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 #[allow(non_camel_case_types)]
 pub enum Compression {
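
For reference, below is a minimal sketch (not part of the patch itself) of how the behaviour described in these docs is selected through `WriterProperties`: the defaults use PLAIN/RLE/RLE_DICTIONARY with no block compression, while a newer writer version opts in to the delta encodings and `set_compression` enables a block codec. It assumes the crate's `zstd` feature is enabled, and the ZSTD level of 3 is an arbitrary example value.

```rust
use parquet::basic::{Compression, ZstdLevel};
use parquet::file::properties::{WriterProperties, WriterVersion};

fn main() {
    // Defaults: PLAIN / RLE / RLE_DICTIONARY encodings and no block compression.
    let _defaults = WriterProperties::builder().build();

    // Opt in to the delta encodings (newer writer version) and ZSTD block
    // compression, trading encode/decode speed for storage efficiency.
    // The level of 3 is an arbitrary example value.
    let _tuned = WriterProperties::builder()
        .set_writer_version(WriterVersion::PARQUET_2_0)
        .set_compression(Compression::ZSTD(
            ZstdLevel::try_new(3).expect("valid zstd level"),
        ))
        .build();
}
```

Either set of properties can then be passed to a writer, for example as the optional properties argument of `ArrowWriter::try_new`, when writing files.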