Use compatible column name to set Parquet bloom filter
huaxingao committed Dec 16, 2024
1 parent 57ea310 commit d7e6891
Showing 2 changed files with 60 additions and 33 deletions.
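The change, in short: when an Iceberg column name is not a valid Avro identifier (for example "int-decimal", with a dash), the name that ends up in the Parquet file schema is a sanitized, compatible form, so a bloom filter property keyed by the original Iceberg name matched no column in the file and the filter was never written. The commit resolves the configured name to its field ID and from there to the column path that actually appears in the Parquet schema. A hedged illustration of the name rewriting follows; the class name is made up, and AvroSchemaUtil.makeCompatibleName is the same helper the updated test uses.

import org.apache.iceberg.avro.AvroSchemaUtil;

// Illustrative class, not part of the commit.
public class CompatibleNameDemo {
  public static void main(String[] args) {
    String icebergName = "int-decimal";
    // The dash is not a valid Avro name character, so the helper returns a
    // sanitized replacement; the exact encoding is an implementation detail.
    String compatibleName = AvroSchemaUtil.makeCompatibleName(icebergName);
    System.out.println(icebergName + " is written as " + compatibleName);
  }
}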
74 changes: 50 additions & 24 deletions parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
@@ -55,6 +55,7 @@
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@@ -95,6 +96,7 @@
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ArrayUtil;
import org.apache.iceberg.util.ByteBuffers;
import org.apache.iceberg.util.PropertyUtil;
@@ -115,8 +117,12 @@
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Parquet {
private static final Logger LOG = LoggerFactory.getLogger(Parquet.class);

private Parquet() {}

private static final Collection<String> READ_PROPERTIES_TO_REMOVE =
@@ -266,6 +272,43 @@ private WriteBuilder createContextFunc(
return this;
}

private <T> void setBloomFilterConfig(
Context context,
MessageType parquetSchema,
BiConsumer<String, Boolean> withBloomFilterEnabled,
BiConsumer<String, Double> withBloomFilterFPP) {

Map<Integer, String> fieldIdToParquetPath =
parquetSchema.getColumns().stream()
.collect(
Collectors.toMap(
col -> col.getPrimitiveType().getId().intValue(),
col -> String.join(".", col.getPath())));

context
.columnBloomFilterEnabled()
.forEach(
(colPath, isEnabled) -> {
Types.NestedField fieldId = schema.caseInsensitiveFindField(colPath);
if (fieldId == null) {
LOG.warn("Skipping bloom filter config for missing field: {}", colPath);
return;
}

String parquetColumnPath = fieldIdToParquetPath.get(fieldId.fieldId());
if (parquetColumnPath == null) {
LOG.warn("Skipping bloom filter config for missing field: {}", fieldId);
return;
}

withBloomFilterEnabled.accept(parquetColumnPath, Boolean.valueOf(isEnabled));
String fpp = context.columnBloomFilterFpp().get(colPath);
if (fpp != null) {
withBloomFilterFPP.accept(parquetColumnPath, Double.parseDouble(fpp));
}
});
}

public <D> FileAppender<D> build() throws IOException {
Preconditions.checkNotNull(schema, "Schema is required");
Preconditions.checkNotNull(name, "Table name is required and cannot be null");
@@ -285,8 +328,6 @@ public <D> FileAppender<D> build() throws IOException {
int rowGroupCheckMinRecordCount = context.rowGroupCheckMinRecordCount();
int rowGroupCheckMaxRecordCount = context.rowGroupCheckMaxRecordCount();
int bloomFilterMaxBytes = context.bloomFilterMaxBytes();
Map<String, String> columnBloomFilterFpp = context.columnBloomFilterFpp();
Map<String, String> columnBloomFilterEnabled = context.columnBloomFilterEnabled();
boolean dictionaryEnabled = context.dictionaryEnabled();

if (compressionLevel != null) {
@@ -343,17 +384,8 @@ public <D> FileAppender<D> build() throws IOException {
.withMaxRowCountForPageSizeCheck(rowGroupCheckMaxRecordCount)
.withMaxBloomFilterBytes(bloomFilterMaxBytes);

for (Map.Entry<String, String> entry : columnBloomFilterEnabled.entrySet()) {
String colPath = entry.getKey();
String bloomEnabled = entry.getValue();
propsBuilder.withBloomFilterEnabled(colPath, Boolean.parseBoolean(bloomEnabled));
}

for (Map.Entry<String, String> entry : columnBloomFilterFpp.entrySet()) {
String colPath = entry.getKey();
String fpp = entry.getValue();
propsBuilder.withBloomFilterFPP(colPath, Double.parseDouble(fpp));
}
setBloomFilterConfig(
context, type, propsBuilder::withBloomFilterEnabled, propsBuilder::withBloomFilterFPP);

ParquetProperties parquetProperties = propsBuilder.build();

@@ -386,17 +418,11 @@ public <D> FileAppender<D> build() throws IOException {
.withDictionaryPageSize(dictionaryPageSize)
.withEncryption(fileEncryptionProperties);

for (Map.Entry<String, String> entry : columnBloomFilterEnabled.entrySet()) {
String colPath = entry.getKey();
String bloomEnabled = entry.getValue();
parquetWriteBuilder.withBloomFilterEnabled(colPath, Boolean.parseBoolean(bloomEnabled));
}

for (Map.Entry<String, String> entry : columnBloomFilterFpp.entrySet()) {
String colPath = entry.getKey();
String fpp = entry.getValue();
parquetWriteBuilder.withBloomFilterFPP(colPath, Double.parseDouble(fpp));
}
setBloomFilterConfig(
context,
type,
parquetWriteBuilder::withBloomFilterEnabled,
parquetWriteBuilder::withBloomFilterFPP);

return new ParquetWriteAdapter<>(parquetWriteBuilder.build(), metricsConfig);
}
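The setBloomFilterConfig helper added above performs two lookups for each configured column: the Iceberg schema resolves the user-supplied name to a field ID, and the Parquet MessageType supplies the dotted column path registered under that ID, so the bloom filter properties are keyed by the name that actually appears in the file. A self-contained sketch of that lookup follows; the one-column schema and the table name "table" are placeholders for illustration.

import java.util.Map;
import java.util.stream.Collectors;
import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.parquet.schema.MessageType;

public class BloomFilterPathLookup {
  public static void main(String[] args) {
    Schema schema =
        new Schema(Types.NestedField.optional(1, "int-decimal", Types.DecimalType.of(8, 2)));
    MessageType parquetSchema = ParquetSchemaUtil.convert(schema, "table");

    // Field ID -> dotted Parquet column path, the same map the helper builds.
    Map<Integer, String> fieldIdToParquetPath =
        parquetSchema.getColumns().stream()
            .collect(
                Collectors.toMap(
                    col -> col.getPrimitiveType().getId().intValue(),
                    col -> String.join(".", col.getPath())));

    // Resolve the user-facing Iceberg name to the path used inside the file;
    // the two can differ when the Iceberg name is not Avro-compatible.
    Types.NestedField field = schema.caseInsensitiveFindField("int-decimal");
    System.out.println(fieldIdToParquetPath.get(field.fieldId()));
  }
}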
19 changes: 10 additions & 9 deletions parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java
@@ -109,7 +109,7 @@ public class TestBloomRowGroupFilter {
optional(22, "timestamp", Types.TimestampType.withoutZone()),
optional(23, "timestamptz", Types.TimestampType.withZone()),
optional(24, "binary", Types.BinaryType.get()),
optional(25, "int_decimal", Types.DecimalType.of(8, 2)),
optional(25, "int-decimal", Types.DecimalType.of(8, 2)),
optional(26, "long_decimal", Types.DecimalType.of(14, 2)),
optional(27, "fixed_decimal", Types.DecimalType.of(31, 2)));

@@ -140,7 +140,7 @@ public class TestBloomRowGroupFilter {
optional(22, "_timestamp", Types.TimestampType.withoutZone()),
optional(23, "_timestamptz", Types.TimestampType.withZone()),
optional(24, "_binary", Types.BinaryType.get()),
optional(25, "_int_decimal", Types.DecimalType.of(8, 2)),
optional(25, "_int-decimal", Types.DecimalType.of(8, 2)),
optional(26, "_long_decimal", Types.DecimalType.of(14, 2)),
optional(27, "_fixed_decimal", Types.DecimalType.of(31, 2)));

@@ -193,6 +193,7 @@ public void createInputFile() throws IOException {

// build struct field schema
org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(UNDERSCORE_STRUCT_FIELD_TYPE);
String compatibleFieldName = AvroSchemaUtil.makeCompatibleName("_int-decimal");

OutputFile outFile = Files.localOutput(temp);
try (FileAppender<Record> appender =
@@ -221,7 +222,7 @@ public void createInputFile() throws IOException {
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_timestamp", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_timestamptz", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_binary", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_int_decimal", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_int-decimal", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_long_decimal", "true")
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_fixed_decimal", "true")
.build()) {
@@ -256,7 +257,7 @@ public void createInputFile() throws IOException {
builder.set("_timestamp", INSTANT.plusSeconds(i * 86400).toEpochMilli());
builder.set("_timestamptz", INSTANT.plusSeconds(i * 86400).toEpochMilli());
builder.set("_binary", RANDOM_BYTES.get(i));
builder.set("_int_decimal", new BigDecimal(String.valueOf(77.77 + i)));
builder.set(compatibleFieldName, new BigDecimal(String.valueOf(77.77 + i)));
builder.set("_long_decimal", new BigDecimal(String.valueOf(88.88 + i)));
builder.set("_fixed_decimal", new BigDecimal(String.valueOf(99.99 + i)));

@@ -683,23 +684,23 @@ public void testBytesEq() {
}

@Test
public void testIntDeciamlEq() {
public void testIntDecimalEq() {
for (int i = 0; i < INT_VALUE_COUNT; i++) {
boolean shouldRead =
new ParquetBloomRowGroupFilter(
SCHEMA, equal("int_decimal", new BigDecimal(String.valueOf(77.77 + i))))
SCHEMA, equal("int-decimal", new BigDecimal(String.valueOf(77.77 + i))))
.shouldRead(parquetSchema, rowGroupMetadata, bloomStore);
assertThat(shouldRead).as("Should read: decimal within range").isTrue();
}

boolean shouldRead =
new ParquetBloomRowGroupFilter(SCHEMA, equal("int_decimal", new BigDecimal("1234.56")))
new ParquetBloomRowGroupFilter(SCHEMA, equal("int-decimal", new BigDecimal("1234.56")))
.shouldRead(parquetSchema, rowGroupMetadata, bloomStore);
assertThat(shouldRead).as("Should not read: decimal outside range").isFalse();
}

@Test
public void testLongDeciamlEq() {
public void testLongDecimalEq() {
for (int i = 0; i < INT_VALUE_COUNT; i++) {
boolean shouldRead =
new ParquetBloomRowGroupFilter(
@@ -715,7 +716,7 @@ SCHEMA, equal("long_decimal", new BigDecimal(String.valueOf(88.88 + i))))
}

@Test
public void testFixedDeciamlEq() {
public void testFixedDecimalEq() {
for (int i = 0; i < INT_VALUE_COUNT; i++) {
boolean shouldRead =
new ParquetBloomRowGroupFilter(
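For reference, a hedged sketch of the behavior the updated test exercises: enabling a bloom filter on a column whose name contains a dash, using the same write property prefix the test sets. The output path, schema, and writer function below are placeholders; with this change the property keyed by the Iceberg name is translated internally to the compatible Parquet column path.

import java.io.File;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetWriter;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.types.Types;

public class BloomFilterWriteSketch {
  public static void main(String[] args) throws Exception {
    Schema schema =
        new Schema(Types.NestedField.optional(1, "int-decimal", Types.DecimalType.of(8, 2)));

    // Placeholder output location for the sketch.
    try (FileAppender<Record> appender =
        Parquet.write(Files.localOutput(new File("/tmp/bloom-filter-sketch.parquet")))
            .schema(schema)
            .createWriterFunc(GenericParquetWriter::buildWriter)
            // Keyed by the Iceberg column name; resolved to the compatible Parquet path.
            .set(
                TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "int-decimal",
                "true")
            .build()) {
      // Records would be appended here; omitted in this sketch.
    }
  }
}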
