Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[608] Ensure column stats for decimal fields have proper scale set #617

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
package org.apache.xtable.delta;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.MathContext;
import java.math.RoundingMode;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.text.DateFormat;
Expand Down Expand Up @@ -63,7 +64,7 @@ public static Object convertFromDeltaColumnStatValue(Object value, InternalSchem
return null;
}
if (noConversionForSchema(fieldSchema)) {
return castObjectToInternalType(value, fieldSchema.getDataType());
return castObjectToInternalType(value, fieldSchema);
}
// Needs special handling for date and time.
InternalType fieldType = fieldSchema.getDataType();
Expand Down Expand Up @@ -198,7 +199,8 @@ public static Object convertFromDeltaPartitionValue(
}
}

private static Object castObjectToInternalType(Object value, InternalType valueType) {
private static Object castObjectToInternalType(Object value, InternalSchema schema) {
InternalType valueType = schema.getDataType();
switch (valueType) {
case DOUBLE:
if (value instanceof String)
Expand Down Expand Up @@ -232,7 +234,7 @@ private static Object castObjectToInternalType(Object value, InternalType valueT
}
break;
case DECIMAL:
return numberTypeToBigDecimal(value);
return numberTypeToBigDecimal(value, schema);
case LONG:
if (value instanceof Integer) {
return ((Integer) value).longValue();
Expand All @@ -242,18 +244,12 @@ private static Object castObjectToInternalType(Object value, InternalType valueT
return value;
}

private static BigDecimal numberTypeToBigDecimal(Object value) {
// BigDecimal is parsed as Integer, Long, BigInteger and double if none of the above.
if (value instanceof Integer) {
return BigDecimal.valueOf((Integer) value);
} else if (value instanceof Long) {
return BigDecimal.valueOf((Long) value);
} else if (value instanceof BigInteger) {
return new BigDecimal((BigInteger) value);
} else if (value instanceof Double) {
return BigDecimal.valueOf((Double) value);
} else {
return (BigDecimal) value;
}
private static BigDecimal numberTypeToBigDecimal(Object value, InternalSchema schema) {
// BigDecimal is parsed by converting the value to a string and setting the proper scale and
// precision.
int precision = (int) schema.getMetadata().get(InternalSchema.MetadataKey.DECIMAL_PRECISION);
int scale = (int) schema.getMetadata().get(InternalSchema.MetadataKey.DECIMAL_SCALE);
return new BigDecimal(String.valueOf(value), new MathContext(precision))
.setScale(scale, RoundingMode.UNNECESSARY);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.nio.ByteBuffer;
import java.sql.Date;
import java.util.ArrayList;
Expand Down Expand Up @@ -200,14 +201,16 @@ private static ColumnStat getColumnStatFromHudiStat(
Comparable<?> minValue = HoodieAvroUtils.unwrapAvroValueWrapper(columnStats.getMinValue());
Comparable<?> maxValue = HoodieAvroUtils.unwrapAvroValueWrapper(columnStats.getMaxValue());
if (field.getSchema().getDataType() == InternalType.DECIMAL) {
int scale =
(int) field.getSchema().getMetadata().get(InternalSchema.MetadataKey.DECIMAL_SCALE);
minValue =
minValue instanceof ByteBuffer
? convertBytesToBigDecimal((ByteBuffer) minValue, DECIMAL_WRAPPER_SCALE)
: minValue;
? convertBytesToBigDecimal((ByteBuffer) minValue, scale)
: ((BigDecimal) minValue).setScale(scale, RoundingMode.UNNECESSARY);
maxValue =
maxValue instanceof ByteBuffer
? convertBytesToBigDecimal((ByteBuffer) maxValue, DECIMAL_WRAPPER_SCALE)
: maxValue;
? convertBytesToBigDecimal((ByteBuffer) maxValue, scale)
: ((BigDecimal) maxValue).setScale(scale, RoundingMode.UNNECESSARY);
}
return getColumnStatFromValues(
minValue,
Expand All @@ -221,7 +224,9 @@ private static ColumnStat getColumnStatFromHudiStat(
private static BigDecimal convertBytesToBigDecimal(ByteBuffer value, int scale) {
byte[] bytes = new byte[value.remaining()];
value.duplicate().get(bytes);
return new BigDecimal(new BigInteger(bytes), scale);
BigDecimal serializedValue = new BigDecimal(new BigInteger(bytes), DECIMAL_WRAPPER_SCALE);
// set the scale to match the schema
return serializedValue.setScale(scale, RoundingMode.UNNECESSARY);
}

private static ColumnStat getColumnStatFromColRange(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;

import java.math.BigDecimal;
import java.math.MathContext;
import java.math.RoundingMode;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
Expand All @@ -33,6 +36,8 @@
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

import com.google.common.collect.ImmutableMap;

import org.apache.xtable.model.schema.InternalSchema;
import org.apache.xtable.model.schema.InternalType;
import org.apache.xtable.model.schema.PartitionTransformType;
Expand Down Expand Up @@ -214,4 +219,109 @@ private static Stream<Arguments> nonNumericValuesForColStats() {
Arguments.of(Double.NaN, doubleSchema, Double.NaN),
Arguments.of(Double.POSITIVE_INFINITY, doubleSchema, Double.POSITIVE_INFINITY));
}

@ParameterizedTest
@MethodSource("decimalValues")
void parseDecimalValues(
Object deltaValue, InternalSchema fieldSchema, BigDecimal expectedOutput) {
assertEquals(
expectedOutput,
DeltaValueConverter.convertFromDeltaColumnStatValue(deltaValue, fieldSchema));
}

private static Stream<Arguments> decimalValues() {
return Stream.of(
Arguments.of(
-8.00,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 5)
.build())
.build(),
new BigDecimal("-8.00", new MathContext(5, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
-8.00f,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 5)
.build())
.build(),
new BigDecimal("-8.00", new MathContext(5, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
1000,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1000.00", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
1000L,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1000.00", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
"1000",
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1000.00", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
1234.56,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1234.56", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
new BigDecimal("1234.56", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY),
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1234.56", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -286,12 +286,8 @@ private void validateOutput(List<InternalDataFile> output) {
assertEquals(1, decimalColumnStat.getNumNulls());
assertEquals(2, decimalColumnStat.getNumValues());
assertTrue(decimalColumnStat.getTotalSize() > 0);
assertEquals(
new BigDecimal("1234.56"),
((BigDecimal) decimalColumnStat.getRange().getMinValue()).setScale(2));
assertEquals(
new BigDecimal("1234.56"),
((BigDecimal) decimalColumnStat.getRange().getMaxValue()).setScale(2));
assertEquals(new BigDecimal("1234.56"), decimalColumnStat.getRange().getMinValue());
assertEquals(new BigDecimal("1234.56"), decimalColumnStat.getRange().getMaxValue());
}

private HoodieRecord<HoodieAvroPayload> buildRecord(GenericRecord record) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import lombok.AccessLevel;
import lombok.NoArgsConstructor;

import com.google.common.collect.ImmutableMap;

import org.apache.xtable.model.schema.InternalField;
import org.apache.xtable.model.schema.InternalSchema;
import org.apache.xtable.model.schema.InternalType;
Expand Down Expand Up @@ -174,7 +176,16 @@ public class ColumnStatMapUtil {
private static final InternalField DECIMAL_FIELD =
InternalField.builder()
.name("decimal_field")
.schema(InternalSchema.builder().name("decimal").dataType(InternalType.DECIMAL).build())
.schema(
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 5)
.build())
.build())
.build();

private static final InternalField FLOAT_FIELD =
Expand Down Expand Up @@ -312,7 +323,7 @@ public static List<ColumnStat> getColumnStats() {
ColumnStat.builder()
.field(DECIMAL_FIELD)
.numNulls(1)
.range(Range.vector(new BigDecimal("1.0"), new BigDecimal("2.0")))
.range(Range.vector(new BigDecimal("1.00"), new BigDecimal("2.00")))
.numValues(50)
.totalSize(123)
.build();
Expand Down
Loading