Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PARQUET-2468: ParquetMetadata must convert to json #1349

Merged
merged 9 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions parquet-hadoop/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,21 @@
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
RustedBones marked this conversation as resolved.
Show resolved Hide resolved
<groupId>${jackson.groupId}</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>${jackson.groupId}</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson-databind.version}</version>
</dependency>
<dependency>
<groupId>${jackson.datatype.groupId}</groupId>
<artifactId>jackson-datatype-jdk8</artifactId>
<version>${jackson-modules-java8.version}</version>
</dependency>
<dependency>
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import static org.apache.parquet.column.Encoding.RLE_DICTIONARY;
import static org.apache.parquet.format.Util.readColumnMetaData;

import com.fasterxml.jackson.annotation.JsonIgnore;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Set;
Expand Down Expand Up @@ -338,6 +339,7 @@ public ColumnPath getPath() {
* @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead.
*/
@Deprecated
@JsonIgnore
public PrimitiveTypeName getType() {
decryptIfNeeded();
return properties.getType();
Expand Down Expand Up @@ -380,13 +382,15 @@ public PrimitiveType getPrimitiveType() {
/**
* @return the stats for this column
*/
@JsonIgnore
Fokko marked this conversation as resolved.
Show resolved Hide resolved
public abstract Statistics getStatistics();

/**
 * Returns the size statistics of this column chunk.
 *
 * <p>Method should be considered private. The implementation here never produces a value:
 * it unconditionally throws. The getter is marked {@code @JsonIgnore} so that JSON
 * serialization does not invoke it.
 *
 * @return the size stats for this column
 * @throws UnsupportedOperationException always, in this implementation
 */
@JsonIgnore
public SizeStatistics getSizeStatistics() {
  final String reason = "SizeStatistics is not implemented";
  throw new UnsupportedOperationException(reason);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*/
package org.apache.parquet.hadoop.metadata;

import com.fasterxml.jackson.annotation.JsonIgnore;
import java.util.Arrays;
import java.util.Set;
import org.apache.parquet.column.Encoding;
Expand Down Expand Up @@ -76,6 +77,7 @@ public ColumnPath getPath() {
* @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead.
*/
@Deprecated
@JsonIgnore
public PrimitiveTypeName getType() {
  // Deprecated accessor: expose only the primitive type name derived from `type`.
  final PrimitiveTypeName primitiveTypeName = type.getPrimitiveTypeName();
  return primitiveTypeName;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import static java.util.Collections.unmodifiableMap;

import com.fasterxml.jackson.annotation.JsonIgnore;
import java.io.Serializable;
import java.util.Map;
import java.util.Objects;
Expand Down Expand Up @@ -109,6 +110,7 @@ public String getCreatedBy() {
return createdBy;
}

/**
 * Returns the decryptor associated with this file metadata.
 *
 * <p>Marked {@code @JsonIgnore} so the decryptor (and the decryption properties it holds,
 * such as the footer key) is left out when the metadata is rendered as JSON, e.g. for logging.
 *
 * @return the file decryptor stored in this metadata
 */
@JsonIgnore
public InternalFileDecryptor getFileDecryptor() {
  return this.fileDecryptor;
}
Comment on lines +113 to 116
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When logging the file metadata, we probably want to skip this

  "fileDecryptor" : {
    "fileAAD" : null,
    "decryptionProperties" : {
      "footerKey" : "AQIDBAUGBwgJCgsMDQ4PEA==",
      "keyRetriever" : null,
      "aadprefix" : null
    }
  },

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
package org.apache.parquet.hadoop.metadata;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jdk8.Jdk8Module;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
Expand All @@ -32,6 +35,14 @@ public class ParquetMetadata {

private static final ObjectMapper objectMapper = new ObjectMapper();

static {
// Disable FAIL_ON_EMPTY_BEANS on the ObjectMapper. With this feature enabled, parquet-cascading tests fail,
// because LogicalTypeAnnotation implementations are classes without any property.
objectMapper.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS);
// Add support for Java 8 Optional
objectMapper.registerModule(new Jdk8Module());
RustedBones marked this conversation as resolved.
Show resolved Hide resolved
}

/**
* @param parquetMetaData an instance of parquet metadata to convert
* @return the json representation
Expand All @@ -50,19 +61,23 @@ public static String toPrettyJSON(ParquetMetadata parquetMetaData) {

private static String toJSON(ParquetMetadata parquetMetaData, boolean isPrettyPrint) {
try (StringWriter stringWriter = new StringWriter()) {
Object objectToPrint;
if (parquetMetaData.getFileMetaData() == null
|| parquetMetaData.getFileMetaData().getEncryptionType()
== FileMetaData.EncryptionType.UNENCRYPTED) {
objectToPrint = parquetMetaData;
} else {
objectToPrint = parquetMetaData.getFileMetaData();
}

ObjectWriter writer;
if (isPrettyPrint) {
Object objectToPrint;
if (parquetMetaData.getFileMetaData() == null
|| parquetMetaData.getFileMetaData().getEncryptionType()
== FileMetaData.EncryptionType.UNENCRYPTED) {
objectToPrint = parquetMetaData;
} else {
objectToPrint = parquetMetaData.getFileMetaData();
}
objectMapper.writerWithDefaultPrettyPrinter().writeValue(stringWriter, objectToPrint);
writer = objectMapper.writerWithDefaultPrettyPrinter();
} else {
objectMapper.writeValue(stringWriter, parquetMetaData);
Fokko marked this conversation as resolved.
Show resolved Hide resolved
writer = objectMapper.writer();
}

writer.writeValue(stringWriter, objectToPrint);
return stringWriter.toString();
} catch (IOException e) {
throw new RuntimeException(e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@
import org.apache.parquet.column.statistics.LongStatistics;
import org.apache.parquet.column.statistics.SizeStatistics;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.crypto.DecryptionPropertiesFactory;
import org.apache.parquet.crypto.EncryptionPropertiesFactory;
import org.apache.parquet.crypto.FileDecryptionProperties;
import org.apache.parquet.crypto.InternalFileDecryptor;
import org.apache.parquet.example.Paper;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
Expand Down Expand Up @@ -635,18 +639,49 @@ public void randomTestFilterMetaData() {
}

@Test
public void testNullFieldMetadataDebugLogging() {
public void testFieldMetadataDebugLogging() {
MessageType schema = parseMessageType("message test { optional binary some_null_field; }");
org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
new org.apache.parquet.hadoop.metadata.FileMetaData(
schema,
new HashMap<>(),
null,
org.apache.parquet.hadoop.metadata.FileMetaData.EncryptionType.UNENCRYPTED,
null);
List<BlockMetaData> blockMetaDataList = new ArrayList<>();
BlockMetaData blockMetaData = new BlockMetaData();
blockMetaData.addColumn(createColumnChunkMetaData());
blockMetaDataList.add(blockMetaData);
ParquetMetadata metadata = new ParquetMetadata(fileMetaData, blockMetaDataList);
ParquetMetadata.toJSON(metadata);
}

/**
 * Verifies that metadata of a file with an encrypted footer can be rendered as JSON, both
 * compact and pretty-printed, and that the file decryptor is not part of the output.
 */
@Test
public void testEncryptedFieldMetadataDebugLogging() {
  // Build decryption properties through the sample factory configured on the Hadoop conf.
  Configuration conf = new Configuration();
  conf.set(
      EncryptionPropertiesFactory.CRYPTO_FACTORY_CLASS_PROPERTY_NAME,
      "org.apache.parquet.crypto.SampleDecryptionPropertiesFactory");
  DecryptionPropertiesFactory decryptionPropertiesFactory = DecryptionPropertiesFactory.loadFactory(conf);
  FileDecryptionProperties decryptionProperties =
      decryptionPropertiesFactory.getFileDecryptionProperties(conf, null);

  MessageType schema = parseMessageType("message test { optional binary some_null_field; }");

  // File metadata flagged as ENCRYPTED_FOOTER and carrying a real decryptor.
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
      new org.apache.parquet.hadoop.metadata.FileMetaData(
          schema,
          new HashMap<>(),
          null,
          org.apache.parquet.hadoop.metadata.FileMetaData.EncryptionType.ENCRYPTED_FOOTER,
          new InternalFileDecryptor(decryptionProperties));

  List<BlockMetaData> blockMetaDataList = new ArrayList<>();
  ParquetMetadata metadata = new ParquetMetadata(fileMetaData, blockMetaDataList);

  // Both renderings must succeed without throwing.
  String json = ParquetMetadata.toJSON(metadata);
  String prettyJson = ParquetMetadata.toPrettyJSON(metadata);

  // The decryptor holds key material (e.g. the footer key) and must never be serialized.
  // Checked explicitly instead of printing the JSON to stdout for manual inspection.
  if (json.contains("fileDecryptor") || prettyJson.contains("fileDecryptor")) {
    throw new AssertionError("fileDecryptor must not appear in the JSON representation of the metadata");
  }
}

@Test
public void testMetadataToJson() {
ParquetMetadata metadata = new ParquetMetadata(null, null);
Expand Down
18 changes: 18 additions & 0 deletions parquet-jackson/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,22 @@
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>${jackson.groupId}</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson-databind.version}</version>
</dependency>
<!-- Add support for Java 8 Optional -->
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jdk8</artifactId>
<version>${jackson-modules-java8.version}</version>
</dependency>
</dependencies>

<properties>
Expand Down Expand Up @@ -70,6 +81,7 @@
<artifactSet>
<includes>
<include>${jackson.groupId}:*</include>
<include>${jackson.datatype.groupId}:*</include>
</includes>
</artifactSet>
<filters>
Expand All @@ -79,6 +91,12 @@
<include>**</include>
</includes>
</filter>
<filter>
<artifact>${jackson.datatype.groupId}:*</artifact>
<includes>
<include>**</include>
</includes>
</filter>
</filters>
<relocations>
<relocation>
Expand Down
3 changes: 2 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,9 @@
<jackson.groupId>com.fasterxml.jackson.core</jackson.groupId>
<jackson.datatype.groupId>com.fasterxml.jackson.datatype</jackson.datatype.groupId>
<jackson.package>com.fasterxml.jackson</jackson.package>
<jackson.version>2.17.0</jackson.version>
<jackson.version>2.17.1</jackson.version>
<jackson-databind.version>2.17.1</jackson-databind.version>
<jackson-modules-java8.version>2.17.1</jackson-modules-java8.version>
<japicmp.version>0.21.0</japicmp.version>
<javax.annotation.version>1.3.2</javax.annotation.version>
<spotless.version>2.30.0</spotless.version>
Expand Down