From 535cf139e97a11776f559b6f25be26e4d4bc0cf9 Mon Sep 17 00:00:00 2001 From: Pieter Raubenheimer Date: Fri, 12 Apr 2024 17:26:45 +0100 Subject: [PATCH 1/3] File containing a Map schema without explicitly required key --- data/README.md | 32 ++++++++++++++++++++++++++++++++ data/hive-map-schema.parquet | Bin 0 -> 595 bytes 2 files changed, 32 insertions(+) create mode 100644 data/hive-map-schema.parquet diff --git a/data/README.md b/data/README.md index f805c8b..d2819b7 100644 --- a/data/README.md +++ b/data/README.md @@ -50,6 +50,7 @@ | float16_zeros_and_nans.parquet | Float16 (logical type) column with NaNs and zeros as min/max values. . See [note](#float16-files) below | | concatenated_gzip_members.parquet | 513 UINT64 numbers compressed using 2 concatenated gzip members in a single data page | | byte_stream_split.zstd.parquet | Standard normals with `BYTE_STREAM_SPLIT` encoding. See [note](#byte-stream-split) below | +| hive-map-schema.parquet | Contains a Map schema without explicitly required keys, produced by Presto. See [note](#hive-map-schema) | TODO: Document what each file is in the table above. @@ -387,3 +388,34 @@ To check conformance of a `BYTE_STREAM_SPLIT` decoder, read each `BYTE_STREAM_SPLIT`-encoded column and compare the decoded values against the values from the corresponding `PLAIN`-encoded column. The values should be equal. + +## Hive Map Schema + +A number of producers, such as Presto/Trino/Athena, create files with schemas where the Map fields are not explicitly marked as required. An optional key is not possible according to the Parquet spec, but the schema is getting created this way. We can recreate these problematic files for testing https://github.com/apache/arrow-rs/pull/5630. 
+ +Using either Presto CLI, or with AWS Athena: + +```sql +CREATE TABLE my_catalog.my_table_name WITH (format = 'Parquet') AS ( + SELECT MAP ( + ARRAY['name', 'parent'], + ARRAY[ + 'report', + 'another' + ] + ) my_map +) +``` + +The schema in the created file is: + +``` +message hive_schema { + OPTIONAL group my_map (MAP) { + REPEATED group key_value (MAP_KEY_VALUE) { + OPTIONAL BYTE_ARRAY key (STRING); + OPTIONAL BYTE_ARRAY value (STRING); + } + } +} +``` \ No newline at end of file diff --git a/data/hive-map-schema.parquet b/data/hive-map-schema.parquet new file mode 100644 index 0000000000000000000000000000000000000000..62102f0feee84a668902e75fcee49dbd3cadcf92 GIT binary patch literal 595 zcmWG=3^EjD5VaGH5&iJ*_d`}0Q5GPVO_Ys+LB5-V0SuUvlM)gV(vsLF{^7}SSf#L0 z!*qQM>xom|XM9;1?$_^GE)P@&)a4?YB>MjF?x$>o40+?AWvIVjMR{un?+G0*-4ps+ zj0_)FZ2hYQG)Gd9Nm7C(FEKY&Qi81@u_!gKM3hNV3@X8%n3rFYky-?lD@rZMFM`R* zh;fK^F#=UFNXm#!VicVKW&<_GcyMQAmZioQCugMQCWqel=9H!a{m;T6%A?98!3-2rW$=*!c@D&91`R!1w%bUBRwMpjil1doD>Cf^HgJlWD5gJV`B@m)MQhmWaH$t mM6+aLlO*%hG-G2ULt`VOWJ?oGNgZGafdVOyk%1uqn4ADNwTff_ literal 0 HcmV?d00001 From 8d9f3620002181e2790c4db117f309b947765e78 Mon Sep 17 00:00:00 2001 From: Pieter Raubenheimer Date: Tue, 16 Apr 2024 10:45:05 +0100 Subject: [PATCH 2/3] Add links to fixes --- data/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/data/README.md b/data/README.md index d2819b7..a33683a 100644 --- a/data/README.md +++ b/data/README.md @@ -391,9 +391,15 @@ be equal. ## Hive Map Schema -A number of producers, such as Presto/Trino/Athena, create files with schemas where the Map fields are not explicitly marked as required. An optional key is not possible according to the Parquet spec, but the schema is getting created this way. We can recreate these problematic files for testing https://github.com/apache/arrow-rs/pull/5630. 
+A number of producers, such as Presto/Trino/Athena, create files with schemas where the Map fields are not explicitly marked as required. An optional key is not possible according to the Parquet spec, but the schema is getting created this way. -Using either Presto CLI, or with AWS Athena: +This issue has been fixed in: +- [Trino v386+](https://github.com/trinodb/trino/commit/3247bd2e64d7422bd13e805cd67cfca3fa8ba520) +- [Presto v0.274+](https://github.com/prestodb/presto/commit/842b46972c11534a7729d0a18e3abc5347922d1a) + +Of course it will take some time for all new files to be produced with these fixes, and the amount of existing data out there remains. + +We can recreate these problematic files for testing [arrow-rs #5630](https://github.com/apache/arrow-rs/pull/5630) with relevant Presto/Trino CLI, or with AWS Athena Console: ```sql CREATE TABLE my_catalog.my_table_name WITH (format = 'Parquet') AS ( From fcc66cdcc4fe9730b1cfb1a377f07c779cdc1f86 Mon Sep 17 00:00:00 2001 From: Pieter Raubenheimer Date: Tue, 16 Apr 2024 15:08:54 +0100 Subject: [PATCH 3/3] Apply suggested changes --- data/README.md | 13 +++++++------ ...schema.parquet => incorrect_map_schema.parquet} | Bin 2 files changed, 7 insertions(+), 6 deletions(-) rename data/{hive-map-schema.parquet => incorrect_map_schema.parquet} (100%) diff --git a/data/README.md b/data/README.md index a33683a..2782a93 100644 --- a/data/README.md +++ b/data/README.md @@ -50,7 +50,7 @@ | float16_zeros_and_nans.parquet | Float16 (logical type) column with NaNs and zeros as min/max values. . See [note](#float16-files) below | | concatenated_gzip_members.parquet | 513 UINT64 numbers compressed using 2 concatenated gzip members in a single data page | | byte_stream_split.zstd.parquet | Standard normals with `BYTE_STREAM_SPLIT` encoding. See [note](#byte-stream-split) below | -| hive-map-schema.parquet | Contains a Map schema without explicitly required keys, produced by Presto. 
See [note](#hive-map-schema) | +| incorrect_map_schema.parquet | Contains a Map schema without explicitly required keys, produced by Presto. See [note](#incorrect-map-schema) | TODO: Document what each file is in the table above. @@ -389,17 +389,18 @@ To check conformance of a `BYTE_STREAM_SPLIT` decoder, read each the values from the corresponding `PLAIN`-encoded column. The values should be equal. -## Hive Map Schema +## Incorrect Map Schema -A number of producers, such as Presto/Trino/Athena, create files with schemas where the Map fields are not explicitly marked as required. An optional key is not possible according to the Parquet spec, but the schema is getting created this way. +A number of producers, such as Presto/Trino/Athena, have been creating files with schemas +where the Map key fields are marked as optional rather than required. +This is not spec-compliant, yet appears in a number of existing data files in the wild. This issue has been fixed in: - [Trino v386+](https://github.com/trinodb/trino/commit/3247bd2e64d7422bd13e805cd67cfca3fa8ba520) - [Presto v0.274+](https://github.com/prestodb/presto/commit/842b46972c11534a7729d0a18e3abc5347922d1a) -Of course it will take some time for all new files to be produced with these fixes, and the amount of existing data out there remains. - -We can recreate these problematic files for testing [arrow-rs #5630](https://github.com/apache/arrow-rs/pull/5630) with relevant Presto/Trino CLI, or with AWS Athena Console: +We can recreate these problematic files for testing [arrow-rs #5630](https://github.com/apache/arrow-rs/pull/5630) +with the relevant Presto/Trino CLI, or with the AWS Athena Console: ```sql CREATE TABLE my_catalog.my_table_name WITH (format = 'Parquet') AS ( diff --git a/data/hive-map-schema.parquet b/data/incorrect_map_schema.parquet similarity index 100% rename from data/hive-map-schema.parquet rename to data/incorrect_map_schema.parquet