From bcf5abcbc9d8dbfe5110ef2cfc095e04a7e5ad46 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 28 Nov 2024 22:56:30 +0100 Subject: [PATCH] GH-3086: Allow for empty beans (#3087) --- .../converter/ParquetMetadataConverter.java | 4 +- .../hadoop/metadata/ParquetMetadata.java | 6 +- .../hadoop/metadata/TestParquetMetadata.java | 113 ++++ .../test-expected-parquet-metadata.json | 500 ++++++++++++++++++ 4 files changed, 620 insertions(+), 3 deletions(-) create mode 100644 parquet-hadoop/src/test/java/org/apache/parquet/hadoop/metadata/TestParquetMetadata.java create mode 100644 parquet-hadoop/src/test/resources/test-expected-parquet-metadata.json diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index bb9e58b7ad..d1c6b01c93 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -1591,7 +1591,9 @@ public FileMetaDataAndRowGroupOffsetInfo visit(RangeMetadataFilter filter) throw ParquetMetadata parquetMetadata = fromParquetMetadata(fileMetaData, fileDecryptor, encryptedFooter, rowGroupToRowIndexOffsetMap); - if (LOG.isDebugEnabled()) LOG.debug(ParquetMetadata.toPrettyJSON(parquetMetadata)); + if (LOG.isDebugEnabled()) { + LOG.debug(ParquetMetadata.toPrettyJSON(parquetMetadata)); + } return parquetMetadata; } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ParquetMetadata.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ParquetMetadata.java index 1709303a95..4f6b1cc5e8 100755 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ParquetMetadata.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ParquetMetadata.java @@ -20,6 +20,7 @@ import 
com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; +import com.fasterxml.jackson.databind.SerializationFeature; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; @@ -27,11 +28,12 @@ /** * Metadata block stored in the footer of the file - * contains file level (Codec, Schema, ...) and block level (location, columns, record count, ...) meta data + * contains file level (Codec, Schema, ...) and block level (location, columns, record count, ...) metadata */ public class ParquetMetadata { - private static final ObjectMapper objectMapper = new ObjectMapper(); + private static final ObjectMapper objectMapper = + new ObjectMapper().configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false); /** * @param parquetMetaData an instance of parquet metadata to convert diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/metadata/TestParquetMetadata.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/metadata/TestParquetMetadata.java new file mode 100644 index 0000000000..7ae3b16313 --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/metadata/TestParquetMetadata.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop.metadata; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.junit.Assert.assertEquals; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Collections; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; +import org.junit.Test; + +public class TestParquetMetadata { + private static final String EXPECTED_JSON = "/test-expected-parquet-metadata.json"; + + // Compare parsed JSON trees rather than raw strings: key order in JSON objects + // is not significant, so a tree comparison avoids sorting keys just for the test. + private static final ObjectMapper mapper = new ObjectMapper(); + + private static JsonNode expectedJson() throws IOException, URISyntaxException { + URI path = TestParquetMetadata.class.getResource(EXPECTED_JSON).toURI(); + return mapper.readTree(path.toURL()); + } + + @Test + public void testToPrettyJSON() throws IOException, URISyntaxException { + MessageType complexParquetSchema = Types.buildMessage() + .addField(Types.optional(INT32) + .as(LogicalTypeAnnotation.intType(8)) +
.named("a")) + .addField(Types.optionalGroup() + .addField(Types.optional(INT32) + .as(LogicalTypeAnnotation.intType(16)) + .named("c")) + .addField(Types.optional(BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named("d")) + .named("b")) + .addField(Types.optionalList() + .setElementType(Types.optional(INT32) + .as(LogicalTypeAnnotation.dateType()) + .named("element")) + .named("e")) + .addField(Types.optionalList() + .setElementType(Types.optional(INT32) + .as(LogicalTypeAnnotation.dateType()) + .named("element")) + .named("f")) + .addField(Types.optional(FLOAT).named("g")) + .addField(Types.optional(INT64) + .as(LogicalTypeAnnotation.timestampType(true, MILLIS)) + .named("h")) + .addField(Types.optional(INT64) + .as(LogicalTypeAnnotation.timestampType(true, NANOS)) + .named("i")) + .addField(Types.optional(INT64) + .as(LogicalTypeAnnotation.timestampType(false, MILLIS)) + .named("j")) + .addField(Types.optional(INT64) + .as(LogicalTypeAnnotation.timestampType(true, MICROS)) + .named("k")) + .addField(Types.optional(INT64) + .as(LogicalTypeAnnotation.timestampType(false, MICROS)) + .named("l")) + .addField(Types.optional(FIXED_LEN_BYTE_ARRAY) + .length(12) + .as(LogicalTypeAnnotation.intervalType()) + .named("m")) + .addField(Types.optionalMap() + .key(Types.optional(INT32) + .as(LogicalTypeAnnotation.dateType()) + .named("key")) + .value(Types.optional(BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named("value")) + .named("list")) + .named("root"); + + FileMetaData fmd = new FileMetaData(complexParquetSchema, Collections.emptyMap(), "ASF"); + String actualJson = ParquetMetadata.toPrettyJSON(new ParquetMetadata(fmd, Collections.emptyList())); + + assertEquals(expectedJson(), mapper.readTree(actualJson)); + } +} diff --git a/parquet-hadoop/src/test/resources/test-expected-parquet-metadata.json b/parquet-hadoop/src/test/resources/test-expected-parquet-metadata.json new file mode 100644 index 0000000000..fad63f4acc --- /dev/null +++
b/parquet-hadoop/src/test/resources/test-expected-parquet-metadata.json @@ -0,0 +1,500 @@ +{ + "schema" : { + "name" : "root", + "repetition" : "REPEATED", + "logicalTypeAnnotation" : null, + "id" : null, + "fields" : [ { + "name" : "a", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "bitWidth" : 8, + "signed" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "INT_8" + }, { + "name" : "b", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : null, + "id" : null, + "fields" : [ { + "name" : "c", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "bitWidth" : 16, + "signed" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "INT_16" + }, { + "name" : "d", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "BINARY", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "UTF8" + } ], + "primitive" : false, + "fieldCount" : 2, + "originalType" : null + }, { + "name" : "e", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "fields" : [ { + "name" : "list", + "repetition" : "REPEATED", + "logicalTypeAnnotation" : null, + "id" : null, + "fields" : [ { + "name" : "element", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "DATE" + } ], + "primitive" : false, + "fieldCount" : 1, + "originalType" : null + } ], + "primitive" : false, + "fieldCount" : 1, + "originalType" : "LIST" + }, { + "name" : "f", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "fields" : [ { + "name" : "list", + "repetition" : "REPEATED", + "logicalTypeAnnotation" : null, + "id" : null, + 
"fields" : [ { + "name" : "element", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "DATE" + } ], + "primitive" : false, + "fieldCount" : 1, + "originalType" : null + } ], + "primitive" : false, + "fieldCount" : 1, + "originalType" : "LIST" + }, { + "name" : "g", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : null, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "FLOAT", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : null + }, { + "name" : "h", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "unit" : "MILLIS", + "adjustedToUTC" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "TIMESTAMP_MILLIS" + }, { + "name" : "i", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "unit" : "NANOS", + "adjustedToUTC" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : null + }, { + "name" : "j", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "unit" : "MILLIS", + "adjustedToUTC" : false + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "TIMESTAMP_MILLIS" + }, { + "name" : "k", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "unit" : "MICROS", + "adjustedToUTC" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "TIMESTAMP_MICROS" + }, { + "name" : "l", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "unit" : "MICROS", + "adjustedToUTC" : false + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + 
"decimalMetadata" : null, + "originalType" : "TIMESTAMP_MICROS" + }, { + "name" : "m", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "FIXED_LEN_BYTE_ARRAY", + "typeLength" : 12, + "decimalMetadata" : null, + "originalType" : "INTERVAL" + }, { + "name" : "list", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "fields" : [ { + "name" : "key_value", + "repetition" : "REPEATED", + "logicalTypeAnnotation" : null, + "id" : null, + "fields" : [ { + "name" : "key", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "DATE" + }, { + "name" : "value", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "BINARY", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "UTF8" + } ], + "primitive" : false, + "fieldCount" : 2, + "originalType" : null + } ], + "primitive" : false, + "fieldCount" : 1, + "originalType" : "MAP" + } ], + "paths" : [ [ "a" ], [ "b", "c" ], [ "b", "d" ], [ "e", "list", "element" ], [ "f", "list", "element" ], [ "g" ], [ "h" ], [ "i" ], [ "j" ], [ "k" ], [ "l" ], [ "m" ], [ "list", "key_value", "key" ], [ "list", "key_value", "value" ] ], + "columns" : [ { + "path" : [ "a" ], + "type" : "INT32", + "typeLength" : 0, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 1, + "primitiveType" : { + "name" : "a", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "bitWidth" : 8, + "signed" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "INT_8" + } + }, { + "path" : [ "b", "c" ], + "type" : "INT32", + "typeLength" : 0, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 2, + "primitiveType" : { + "name" : "c", + 
"repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "bitWidth" : 16, + "signed" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "INT_16" + } + }, { + "path" : [ "b", "d" ], + "type" : "BINARY", + "typeLength" : 0, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 2, + "primitiveType" : { + "name" : "d", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "BINARY", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "UTF8" + } + }, { + "path" : [ "e", "list", "element" ], + "type" : "INT32", + "typeLength" : 0, + "maxRepetitionLevel" : 1, + "maxDefinitionLevel" : 3, + "primitiveType" : { + "name" : "element", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "DATE" + } + }, { + "path" : [ "f", "list", "element" ], + "type" : "INT32", + "typeLength" : 0, + "maxRepetitionLevel" : 1, + "maxDefinitionLevel" : 3, + "primitiveType" : { + "name" : "element", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "DATE" + } + }, { + "path" : [ "g" ], + "type" : "FLOAT", + "typeLength" : 0, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 1, + "primitiveType" : { + "name" : "g", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : null, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "FLOAT", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : null + } + }, { + "path" : [ "h" ], + "type" : "INT64", + "typeLength" : 0, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 1, + "primitiveType" : { + "name" : "h", + "repetition" : "OPTIONAL", + 
"logicalTypeAnnotation" : { + "unit" : "MILLIS", + "adjustedToUTC" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "TIMESTAMP_MILLIS" + } + }, { + "path" : [ "i" ], + "type" : "INT64", + "typeLength" : 0, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 1, + "primitiveType" : { + "name" : "i", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "unit" : "NANOS", + "adjustedToUTC" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : null + } + }, { + "path" : [ "j" ], + "type" : "INT64", + "typeLength" : 0, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 1, + "primitiveType" : { + "name" : "j", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "unit" : "MILLIS", + "adjustedToUTC" : false + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "TIMESTAMP_MILLIS" + } + }, { + "path" : [ "k" ], + "type" : "INT64", + "typeLength" : 0, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 1, + "primitiveType" : { + "name" : "k", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "unit" : "MICROS", + "adjustedToUTC" : true + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "TIMESTAMP_MICROS" + } + }, { + "path" : [ "l" ], + "type" : "INT64", + "typeLength" : 0, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 1, + "primitiveType" : { + "name" : "l", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { + "unit" : "MICROS", + "adjustedToUTC" : false + }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT64", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "TIMESTAMP_MICROS" + } + }, { + "path" : [ "m" ], + "type" : 
"FIXED_LEN_BYTE_ARRAY", + "typeLength" : 12, + "maxRepetitionLevel" : 0, + "maxDefinitionLevel" : 1, + "primitiveType" : { + "name" : "m", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "FIXED_LEN_BYTE_ARRAY", + "typeLength" : 12, + "decimalMetadata" : null, + "originalType" : "INTERVAL" + } + }, { + "path" : [ "list", "key_value", "key" ], + "type" : "INT32", + "typeLength" : 0, + "maxRepetitionLevel" : 1, + "maxDefinitionLevel" : 3, + "primitiveType" : { + "name" : "key", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "INT32", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "DATE" + } + }, { + "path" : [ "list", "key_value", "value" ], + "type" : "BINARY", + "typeLength" : 0, + "maxRepetitionLevel" : 1, + "maxDefinitionLevel" : 3, + "primitiveType" : { + "name" : "value", + "repetition" : "OPTIONAL", + "logicalTypeAnnotation" : { }, + "id" : null, + "primitive" : true, + "primitiveTypeName" : "BINARY", + "typeLength" : 0, + "decimalMetadata" : null, + "originalType" : "UTF8" + } + } ], + "primitive" : false, + "fieldCount" : 12, + "originalType" : null + }, + "keyValueMetaData" : { }, + "createdBy" : "ASF", + "encryptionType" : null +}