From b9c22c1d3436a0b7d411756496da5b02555654af Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 13 Nov 2024 10:57:10 +0800 Subject: [PATCH] add per column config --- parquet-hadoop/README.md | 9 +++++++++ .../parquet/hadoop/ParquetOutputFormat.java | 16 ++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/parquet-hadoop/README.md b/parquet-hadoop/README.md index c00fea2439..5817069f4b 100644 --- a/parquet-hadoop/README.md +++ b/parquet-hadoop/README.md @@ -515,4 +515,13 @@ if not found then the library will use the classic non-vectored reads: it is saf **Description:** Whether to enable column statistics collection. If `true`, statistics will be collected for all columns unless explicitly disabled for specific columns. If `false`, statistics will be disabled for all columns regardless of column-specific settings. +It is possible to enable or disable statistics for specific columns by appending `#` followed by the column path to the property name. **Default value:** `true` +**Example:** +```java +// Enable statistics for all columns +conf.setBoolean("parquet.column.statistics.enabled", true); +// Disable statistics for 'column.path' +conf.setBoolean("parquet.column.statistics.enabled#column.path", false); +// Result: statistics are enabled for all columns except 'column.path' +``` diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java index b16d743a98..c13781a685 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java @@ -397,6 +397,18 @@ public static boolean getStatisticsEnabled(Configuration conf) { return conf.getBoolean(STATISTICS_ENABLED, ParquetProperties.DEFAULT_STATISTICS_ENABLED); } + public static void setStatisticsEnabled(JobContext jobContext, String columnPath, boolean enabled) { + 
getConfiguration(jobContext).set(STATISTICS_ENABLED + "#" + columnPath, String.valueOf(enabled)); + } + + public static boolean getStatisticsEnabled(Configuration conf, String columnPath) { + String columnSpecific = conf.get(STATISTICS_ENABLED + "#" + columnPath); + if (columnSpecific != null) { + return Boolean.parseBoolean(columnSpecific); + } + return conf.getBoolean(STATISTICS_ENABLED, ParquetProperties.DEFAULT_STATISTICS_ENABLED); + } + private WriteSupport writeSupport; private ParquetOutputCommitter committer; @@ -489,6 +501,10 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp BLOOM_FILTER_CANDIDATES_NUMBER, key -> conf.getInt(key, ParquetProperties.DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER), propsBuilder::withBloomFilterCandidatesNumber) + .withColumnConfig( + STATISTICS_ENABLED, + key -> conf.getBoolean(key, ParquetProperties.DEFAULT_STATISTICS_ENABLED), + propsBuilder::withStatisticsEnabled) .parseConfig(conf); ParquetProperties props = propsBuilder.build();