From e989f51e44bc838632ba4d0e031ad6adcec4b09a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 18 Jan 2024 10:38:26 +0100 Subject: [PATCH 1/2] PARQUET-2414: Extend BYTE_STREAM_SPLIT to support INT32, INT64 and FIXED_LEN_BYTE_ARRAY data --- CHANGES.md | 6 ++++++ Encodings.md | 5 +++-- src/main/thrift/parquet.thrift | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 400200040..7bbce7c4e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -19,6 +19,12 @@ # Parquet # +### Version 2.11.0 ### + +#### New Feature + +* [PARQUET-2414](https://issues.apache.org/jira/browse/PARQUET-2414) - Extend BYTE_STREAM_SPLIT to support INT32, INT64 and FIXED_LEN_BYTE_ARRAY data + ### Version 2.10.0 ### #### New Feature diff --git a/Encodings.md b/Encodings.md index aaf7a362f..954c32fae 100644 --- a/Encodings.md +++ b/Encodings.md @@ -335,14 +335,15 @@ Note that, even for FIXED_LEN_BYTE_ARRAY, all lengths are encoded despite the re ### Byte Stream Split: (BYTE_STREAM_SPLIT = 9) -Supported Types: FLOAT, DOUBLE +Supported Types: INT32, INT64, FLOAT, DOUBLE, FIXED_LEN_BYTE_ARRAY This encoding does not reduce the size of the data but can lead to a significantly better compression ratio and speed when a compression algorithm is used afterwards. This encoding creates K byte-streams of length N where K is the size in bytes of the data -type and N is the number of elements in the data sequence. Specifically, K is 4 for FLOAT +type and N is the number of elements in the data sequence. For example, K is 4 for FLOAT type and 8 for DOUBLE type. + The bytes of each value are scattered to the corresponding streams. The 0-th byte goes to the 0-th stream, the 1-st byte goes to the 1-st stream and so on. The streams are concatenated in the following order: 0-th stream, 1-st stream, etc. diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 2084ac63c..ce4f35dd0 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -526,9 +526,9 @@ enum Encoding { */ RLE_DICTIONARY = 8; - /** Encoding for floating-point data. + /** Encoding for fixed-width data (INT32, INT64, FLOAT, DOUBLE, FIXED_LEN_BYTE_ARRAY). K byte-streams are created where K is the size in bytes of the data type. - The individual bytes of an FP value are scattered to the corresponding stream and + The individual bytes of a value are scattered to the corresponding stream and the streams are concatenated. This itself does not reduce the size of the data but can lead to better compression afterwards. From 88d3cbd77da5d1afc57291b1709146ad0167ccb0 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 29 Jan 2024 16:14:28 +0100 Subject: [PATCH 2/2] Address review comments --- Encodings.md | 2 +- src/main/thrift/parquet.thrift | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Encodings.md b/Encodings.md index 954c32fae..51c6d8060 100644 --- a/Encodings.md +++ b/Encodings.md @@ -335,7 +335,7 @@ Note that, even for FIXED_LEN_BYTE_ARRAY, all lengths are encoded despite the re ### Byte Stream Split: (BYTE_STREAM_SPLIT = 9) -Supported Types: INT32, INT64, FLOAT, DOUBLE, FIXED_LEN_BYTE_ARRAY +Supported Types: FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY This encoding does not reduce the size of the data but can lead to a significantly better compression ratio and speed when a compression algorithm is used afterwards. diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index ce4f35dd0..27d404374 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -526,12 +526,15 @@ enum Encoding { */ RLE_DICTIONARY = 8; - /** Encoding for fixed-width data (INT32, INT64, FLOAT, DOUBLE, FIXED_LEN_BYTE_ARRAY). + /** Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY). K byte-streams are created where K is the size in bytes of the data type. The individual bytes of a value are scattered to the corresponding stream and the streams are concatenated. This itself does not reduce the size of the data but can lead to better compression afterwards. + + Added in 2.8 for FLOAT and DOUBLE. + Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. */ BYTE_STREAM_SPLIT = 9; }