From dbfe60fec46272bd1b61625d961c6636ada3442a Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 16 Aug 2024 13:25:38 +0800 Subject: [PATCH 1/9] Clarify num-nulls handling --- src/main/thrift/parquet.thrift | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 9e83529ac..55729e3dd 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -257,7 +257,11 @@ struct Statistics { */ 1: optional binary max; 2: optional binary min; - /** count of null value in the column */ + /** + * count of null value in the column + * + * Writers should write this field even if it is zero or in non-null columns. + */ 3: optional i64 null_count; /** count of distinct values occurring */ 4: optional i64 distinct_count; From 0ebe97f51d0e88f4665d2bb65072ae0634250cef Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 19 Aug 2024 15:14:36 +0800 Subject: [PATCH 2/9] Update src/main/thrift/parquet.thrift Co-authored-by: Gang Wu --- src/main/thrift/parquet.thrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 55729e3dd..22f38d6a8 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -260,7 +260,7 @@ struct Statistics { /** * count of null value in the column * - * Writers should write this field even if it is zero or in non-null columns. + * Writers SHOULD always write this field even if it is zero (a.k.a. no null value) */ 3: optional i64 null_count; /** count of distinct values occurring */ From d3f210fdedb1c721023b18af20eab1ff44183c67 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 21 Aug 2024 13:46:51 +0800 Subject: [PATCH 3/9] Add readers behavior --- src/main/thrift/parquet.thrift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 22f38d6a8..3920e217b 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -261,6 +261,9 @@ struct Statistics { * count of null value in the column * * Writers SHOULD always write this field even if it is zero (a.k.a. no null value) + * or is an not nullable column. + * Readers SHOULD distinct null_count == 0 or not having null_count. If null_count + * doesn't exists, Readers cannot gurantees null_count == 0. */ 3: optional i64 null_count; /** count of distinct values occurring */ From 2e0df3a20cfecf9dfb90539b0c52ea15b7a6e84e Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 21 Aug 2024 14:10:37 +0800 Subject: [PATCH 4/9] Apply suggestions from code review Co-authored-by: Ed Seidl --- src/main/thrift/parquet.thrift | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 3920e217b..f6f4bc777 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -258,12 +258,12 @@ struct Statistics { 1: optional binary max; 2: optional binary min; /** - * count of null value in the column + * Count of null values in the column. * - * Writers SHOULD always write this field even if it is zero (a.k.a. no null value) - * or is an not nullable column. - * Readers SHOULD distinct null_count == 0 or not having null_count. If null_count - * doesn't exists, Readers cannot gurantees null_count == 0. + * Writers SHOULD always write this field even if it is zero (i.e. no null value) + * or the column is not nullable. + * Readers SHOULD distinguish between null_count not being present and null_count == 0. + * If null_count is not present, readers SHOULD NOT assume null_count == 0. */ 3: optional i64 null_count; /** count of distinct values occurring */ From fe815eae6fbfd00d68d3593b968ff8a6c27baf5f Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 21 Aug 2024 15:04:03 +0800 Subject: [PATCH 5/9] Reader: change SHOULD to MUST --- src/main/thrift/parquet.thrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index f6f4bc777..11107f8f3 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -262,7 +262,7 @@ struct Statistics { * * Writers SHOULD always write this field even if it is zero (i.e. no null value) * or the column is not nullable. - * Readers SHOULD distinguish between null_count not being present and null_count == 0. + * Readers MUST distinguish between null_count not being present and null_count == 0. * If null_count is not present, readers SHOULD NOT assume null_count == 0. */ 3: optional i64 null_count; From 1508aa22ac706ca3864ec2aed4e3ac9b9b7215de Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 21 Aug 2024 15:14:26 +0800 Subject: [PATCH 6/9] Add for ColumnIndex --- src/main/thrift/parquet.thrift | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 11107f8f3..c1c917f49 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -263,7 +263,7 @@ struct Statistics { * Writers SHOULD always write this field even if it is zero (i.e. no null value) * or the column is not nullable. * Readers MUST distinguish between null_count not being present and null_count == 0. - * If null_count is not present, readers SHOULD NOT assume null_count == 0. + * If null_count is not present, readers MUST NOT assume null_count == 0. */ 3: optional i64 null_count; /** count of distinct values occurring */ @@ -1091,7 +1091,16 @@ struct ColumnIndex { */ 4: required BoundaryOrder boundary_order - /** A list containing the number of null values for each page **/ + /** + * A list containing the number of null values for each page + * + * Writers SHOULD always write this field even if no null value + * or the column is not nullable. + * Readers MUST distinguish between null_counts not being present + * and null_count is 0. + * If null_counts is not present, readers MUST NOT assume all + * null_count is 0. + */ 5: optional list null_counts /** From 0bf50c5a045ef9cec8f2ccaa6b052e6e8b7ded76 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 21 Aug 2024 21:54:37 +0800 Subject: [PATCH 7/9] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- src/main/thrift/parquet.thrift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index c1c917f49..1a8162c61 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -1097,9 +1097,9 @@ struct ColumnIndex { * Writers SHOULD always write this field even if no null value * or the column is not nullable. * Readers MUST distinguish between null_counts not being present - * and null_count is 0. + * and null_count being 0. * If null_counts is not present, readers MUST NOT assume all - * null_count is 0. + * null counts are 0. */ 5: optional list null_counts From 4dc45022d64d0ab4b955cc1562cc283ae8522da9 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 21 Aug 2024 23:27:12 +0800 Subject: [PATCH 8/9] Update src/main/thrift/parquet.thrift Co-authored-by: Gang Wu --- src/main/thrift/parquet.thrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 1a8162c61..d2179e2f3 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -1098,7 +1098,7 @@ struct ColumnIndex { * or the column is not nullable. * Readers MUST distinguish between null_counts not being present * and null_count being 0. - * If null_counts is not present, readers MUST NOT assume all + * If null_counts are not present, readers MUST NOT assume all * null counts are 0. */ 5: optional list null_counts From aee6e1e6a129be3514fbdc009ec14e0e61af28be Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 22 Aug 2024 00:19:36 +0800 Subject: [PATCH 9/9] Update src/main/thrift/parquet.thrift Co-authored-by: Ed Seidl --- src/main/thrift/parquet.thrift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index d2179e2f3..83457fe29 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -1094,8 +1094,8 @@ struct ColumnIndex { /** * A list containing the number of null values for each page * - * Writers SHOULD always write this field even if no null value - * or the column is not nullable. + * Writers SHOULD always write this field even if no null values + * are present or the column is not nullable. * Readers MUST distinguish between null_counts not being present * and null_count being 0. * If null_counts are not present, readers MUST NOT assume all