From c061d5493ef3c3b7bdee59b062ad8baf056c8503 Mon Sep 17 00:00:00 2001 From: Jiashen zhang Date: Sat, 9 Dec 2023 08:13:01 -0800 Subject: [PATCH] PARQUET-1647: Implement logical type FLOAT16 (#1142) --- .../parquet/column/statistics/Statistics.java | 45 +++ .../org/apache/parquet/io/api/Binary.java | 30 ++ .../org/apache/parquet/schema/Float16.java | 259 ++++++++++++++++ .../parquet/schema/LogicalTypeAnnotation.java | 42 +++ .../parquet/schema/PrimitiveComparator.java | 16 + .../parquet/schema/PrimitiveStringifier.java | 8 + .../apache/parquet/schema/PrimitiveType.java | 14 + .../java/org/apache/parquet/schema/Types.java | 7 + .../org/apache/parquet/io/api/TestBinary.java | 46 +++ .../apache/parquet/schema/TestFloat16.java | 282 ++++++++++++++++++ .../schema/TestPrimitiveComparator.java | 28 ++ .../schema/TestPrimitiveStringifier.java | 36 +++ .../TestTypeBuildersWithLogicalTypes.java | 35 ++- .../apache/parquet/format/LogicalTypes.java | 1 + .../converter/ParquetMetadataConverter.java | 17 +- .../TestParquetMetadataConverter.java | 21 ++ .../TestFloat16ReadWriteRoundTrip.java | 202 +++++++++++++ .../statistics/TestFloat16Statistics.java | 255 ++++++++++++++++ .../parquet/statistics/TestStatistics.java | 21 +- pom.xml | 4 + 20 files changed, 1365 insertions(+), 4 deletions(-) create mode 100644 parquet-column/src/main/java/org/apache/parquet/schema/Float16.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/schema/TestFloat16.java create mode 100644 parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java create mode 100644 parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 4c70f838aa..83070d49f1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ 
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -21,6 +21,8 @@ import java.util.Arrays; import org.apache.parquet.column.UnknownColumnTypeException; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.Float16; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.PrimitiveComparator; import org.apache.parquet.schema.PrimitiveStringifier; import org.apache.parquet.schema.PrimitiveType; @@ -138,6 +140,44 @@ public Statistics build() { } } + // Builder for FLOAT16 type to handle special cases of min/max values like NaN, -0.0, and 0.0 + private static class Float16Builder extends Builder { + private static final Binary POSITIVE_ZERO_LITTLE_ENDIAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x00}); + private static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN = + Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80}); + + public Float16Builder(PrimitiveType type) { + super(type); + assert type.getPrimitiveTypeName() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; + assert type.getTypeLength() == 2; + } + + @Override + public Statistics build() { + BinaryStatistics stats = (BinaryStatistics) super.build(); + if (stats.hasNonNullValue()) { + Binary bMin = stats.genericGetMin(); + Binary bMax = stats.genericGetMax(); + short min = bMin.get2BytesLittleEndian(); + short max = bMax.get2BytesLittleEndian(); + // Drop min/max values in case of NaN as the sorting order of values is undefined for this case + if (Float16.isNaN(min) || Float16.isNaN(max)) { + stats.setMinMax(POSITIVE_ZERO_LITTLE_ENDIAN, NEGATIVE_ZERO_LITTLE_ENDIAN); + ((Statistics) stats).hasNonNullValue = false; + } else { + // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped + if (min == (short) 0x0000) { + stats.setMinMax(NEGATIVE_ZERO_LITTLE_ENDIAN, bMax); + } + if (max == (short) 0x8000) { + stats.setMinMax(bMin, POSITIVE_ZERO_LITTLE_ENDIAN); + } + } + } + return stats; + } + } 
+ private final PrimitiveType type; private final PrimitiveComparator comparator; private boolean hasNonNullValue; @@ -224,6 +264,11 @@ public static Builder getBuilderForReading(PrimitiveType type) { return new FloatBuilder(type); case DOUBLE: return new DoubleBuilder(type); + case FIXED_LEN_BYTE_ARRAY: + LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation) { + return new Float16Builder(type); + } default: return new Builder(type); } diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java index 231dd43a6b..d9a172cecf 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java @@ -24,6 +24,7 @@ import java.io.OutputStream; import java.io.Serializable; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.CharsetEncoder; @@ -85,6 +86,8 @@ private Binary() {} public abstract ByteBuffer toByteBuffer(); + public abstract short get2BytesLittleEndian(); + @Override public boolean equals(Object obj) { if (obj == null) { @@ -219,6 +222,15 @@ public ByteBuffer toByteBuffer() { return ByteBuffer.wrap(value, offset, length); } + @Override + public short get2BytesLittleEndian() { + if (length != 2) { + throw new IllegalArgumentException("length must be 2"); + } + + return (short) (((value[offset + 1] & 0xff) << 8) | (value[offset] & 0xff)); + } + @Override public void writeTo(DataOutput out) throws IOException { out.write(value, offset, length); @@ -370,6 +382,15 @@ public ByteBuffer toByteBuffer() { return ByteBuffer.wrap(value); } + @Override + public short get2BytesLittleEndian() { + if (value.length != 2) { + throw new IllegalArgumentException("length must be 2"); + } + + 
return (short) (((value[1] & 0xff) << 8) | (value[0] & 0xff)); + } + @Override public void writeTo(DataOutput out) throws IOException { out.write(value); @@ -547,6 +568,15 @@ public ByteBuffer toByteBuffer() { return ret; } + @Override + public short get2BytesLittleEndian() { + if (length != 2) { + throw new IllegalArgumentException("length must be 2"); + } + + return value.order(ByteOrder.LITTLE_ENDIAN).getShort(offset); + } + @Override public void writeTo(DataOutput out) throws IOException { // TODO: should not have to materialize those bytes diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java b/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java new file mode 100644 index 0000000000..6fe0e3d4c3 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java @@ -0,0 +1,259 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema; + +import org.apache.parquet.io.api.Binary; + +/** + * The class is a utility class to manipulate half-precision 16-bit + * IEEE 754 + * floating point data types (also called fp16 or binary16). 
A half-precision float can be + * created from or converted to single-precision floats, and is stored in a short data type. + * The IEEE 754 standard specifies a float16 as having the following format: + * <ul> + * <li>Sign bit: 1 bit</li> + * <li>Exponent width: 5 bits</li> + * <li>Significand: 10 bits</li> + * </ul> + * + * <p>The format is laid out as follows:</p> + * <pre>
+ * 1   11111   1111111111
+ * ^   --^--   -----^----
+ * sign  |          |_______ significand
+ *       |
+ *      -- exponent
+ * </pre>
+ * Half-precision floating points can be useful to save memory and/or + * bandwidth at the expense of range and precision when compared to single-precision + * floating points (float32). + * Ref: https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java + */ +public class Float16 { + // Positive infinity of type half-precision float. + private static final short POSITIVE_INFINITY = (short) 0x7c00; + // A Not-a-Number representation of a half-precision float. + private static final short NaN = (short) 0x7e00; + // The bitmask to and a number with to obtain the sign bit. + private static final int SIGN_MASK = 0x8000; + // The offset to shift by to obtain the exponent bits. + private static final int EXPONENT_SHIFT = 10; + // The bitmask to and a number shifted by EXPONENT_SHIFT right, to obtain exponent bits. + private static final int SHIFTED_EXPONENT_MASK = 0x1f; + // The bitmask to and a number with to obtain significand bits. + private static final int SIGNIFICAND_MASK = 0x3ff; + // The offset of the exponent from the actual value. + private static final int EXPONENT_BIAS = 15; + // The offset to shift by to obtain the sign bit. + private static final int SIGN_SHIFT = 15; + // The bitmask to AND with to obtain exponent and significand bits. + private static final int EXPONENT_SIGNIFICAND_MASK = 0x7fff; + + private static final int FP32_SIGN_SHIFT = 31; + private static final int FP32_EXPONENT_SHIFT = 23; + private static final int FP32_SHIFTED_EXPONENT_MASK = 0xff; + private static final int FP32_SIGNIFICAND_MASK = 0x7fffff; + private static final int FP32_EXPONENT_BIAS = 127; + private static final int FP32_QNAN_MASK = 0x400000; + private static final int FP32_DENORMAL_MAGIC = 126 << 23; + private static final float FP32_DENORMAL_FLOAT = Float.intBitsToFloat(FP32_DENORMAL_MAGIC); + + /** + * Returns true if the specified half-precision float value represents + * a Not-a-Number, false otherwise. 
+ * + * @param h A half-precision float value + * @return True if the value is a NaN, false otherwise + * + */ + public static boolean isNaN(short h) { + return (h & EXPONENT_SIGNIFICAND_MASK) > POSITIVE_INFINITY; + } + + /** + *

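<p>Compares the two specified half-precision float values. The following + * conditions apply during the comparison:</p> + * <ul> + * <li>NaN is considered equal to itself and greater than all other half-precision float values, including positive infinity</li> + * <li>Positive zero is considered greater than negative zero</li> + * </ul>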

+ * + * + * + * @param x The first half-precision float value to compare. + * @param y The second half-precision float value to compare + * + * @return The value {@code 0} if {@code x} is numerically equal to {@code y}, a + * value less than {@code 0} if {@code x} is numerically less than {@code y}, + * and a value greater than {@code 0} if {@code x} is numerically greater + * than {@code y} + * + */ + public static int compare(short x, short y) { + boolean xIsNaN = isNaN(x); + boolean yIsNaN = isNaN(y); + + if (!xIsNaN && !yIsNaN) { + int first = ((x & SIGN_MASK) != 0 ? 0x8000 - (x & 0xffff) : x & 0xffff); + int second = ((y & SIGN_MASK) != 0 ? 0x8000 - (y & 0xffff) : y & 0xffff); + // Returns true if the first half-precision float value is less + // (smaller toward negative infinity) than the second half-precision float value. + if (first < second) { + return -1; + } + + // Returns true if the first half-precision float value is greater + // (larger toward positive infinity) than the second half-precision float value. + if (first > second) { + return 1; + } + } + + // Collapse NaNs, akin to halfToIntBits(), but we want to keep + // (signed) short value types to preserve the ordering of -0.0 + // and +0.0 + short xBits = xIsNaN ? NaN : x; + short yBits = yIsNaN ? NaN : y; + return (xBits == yBits ? 0 : (xBits < yBits ? -1 : 1)); + } + + /** + * Converts the specified half-precision float value in Binary little endian into a + * single-precision float value. The following special cases are handled: + * If the input is NaN, the returned value is Float NaN. + * If the input is POSITIVE_INFINITY or NEGATIVE_INFINITY, the returned value is respectively + * Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY. + * If the input is 0 (positive or negative), the returned value is +/-0.0f. + * Otherwise, the returned value is a normalized single-precision float value. 
+ * + * @param b The half-precision float value in Binary little endian to convert to single-precision + * @return A normalized single-precision float value + */ + static float toFloat(Binary b) { + short h = b.get2BytesLittleEndian(); + int bits = h & 0xffff; + int s = bits & SIGN_MASK; + int e = (bits >>> EXPONENT_SHIFT) & SHIFTED_EXPONENT_MASK; + int m = (bits) & SIGNIFICAND_MASK; + int outE = 0; + int outM = 0; + if (e == 0) { // Denormal or 0 + if (m != 0) { + // Convert denorm fp16 into normalized fp32 + float o = Float.intBitsToFloat(FP32_DENORMAL_MAGIC + m); + o -= FP32_DENORMAL_FLOAT; + return s == 0 ? o : -o; + } + } else { + outM = m << 13; + if (e == 0x1f) { // Infinite or NaN + outE = 0xff; + if (outM != 0) { // SNaNs are quieted + outM |= FP32_QNAN_MASK; + } + } else { + outE = e - EXPONENT_BIAS + FP32_EXPONENT_BIAS; + } + } + int out = (s << 16) | (outE << FP32_EXPONENT_SHIFT) | outM; + return Float.intBitsToFloat(out); + } + + /** + * Converts the specified single-precision float value into a + * half-precision float value. The following special cases are handled: + * + * If the input is NaN, the returned value is NaN. + * If the input is Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY, + * the returned value is respectively POSITIVE_INFINITY or NEGATIVE_INFINITY. + * If the input is 0 (positive or negative), the returned value is + * POSITIVE_ZERO or NEGATIVE_ZERO. + * If the input is a less than MIN_VALUE, the returned value + * is flushed to POSITIVE_ZERO or NEGATIVE_ZERO. + * If the input is a less than MIN_NORMAL, the returned value + * is a denorm half-precision float. + * Otherwise, the returned value is rounded to the nearest + * representable half-precision float value. 
+ * + * @param f The single-precision float value to convert to half-precision + * @return A half-precision float value + */ + static short toFloat16(float f) { + int bits = Float.floatToRawIntBits(f); + int s = (bits >>> FP32_SIGN_SHIFT); + int e = (bits >>> FP32_EXPONENT_SHIFT) & FP32_SHIFTED_EXPONENT_MASK; + int m = (bits) & FP32_SIGNIFICAND_MASK; + int outE = 0; + int outM = 0; + if (e == 0xff) { // Infinite or NaN + outE = 0x1f; + outM = m != 0 ? 0x200 : 0; + } else { + e = e - FP32_EXPONENT_BIAS + EXPONENT_BIAS; + if (e >= 0x1f) { // Overflow + outE = 0x1f; + } else if (e <= 0) { // Underflow + if (e < -10) { + // The absolute fp32 value is less than MIN_VALUE, flush to +/-0 + } else { + // The fp32 value is a normalized float less than MIN_NORMAL, + // we convert to a denorm fp16 + m = m | 0x800000; + int shift = 14 - e; + outM = m >> shift; + int lowm = m & ((1 << shift) - 1); + int hway = 1 << (shift - 1); + // if above halfway or exactly halfway and outM is odd + if (lowm + (outM & 1) > hway) { + // Round to nearest even + // Can overflow into exponent bit, which surprisingly is OK. + // This increment relies on the +outM in the return statement below + outM++; + } + } + } else { + outE = e; + outM = m >> 13; + // if above halfway or exactly halfway and outM is odd + if ((m & 0x1fff) + (outM & 0x1) > 0x1000) { + // Round to nearest even + // Can overflow into exponent bit, which surprisingly is OK. + // This increment relies on the +outM in the return statement below + outM++; + } + } + } + // The outM is added here as the +1 increments for outM above can + // cause an overflow in the exponent bit which is OK. + return (short) ((s << SIGN_SHIFT) | (outE << EXPONENT_SHIFT) + outM); + } + + /** + * Returns a string representation of the specified half-precision + * float value. Calling this method is equivalent to calling + * Float.toString(toFloat(h)). See {@link Float#toString(float)} + * for more information on the format of the string representation. 
+ * + * @param h A half-precision float value in binary little-endian format + * @return A string representation of the specified value + */ + static String toFloatString(Binary h) { + return Float.toString(Float16.toFloat(h)); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java index ee09c999f8..05629dd388 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java @@ -140,6 +140,12 @@ protected LogicalTypeAnnotation fromString(List params) { protected LogicalTypeAnnotation fromString(List params) { return IntervalLogicalTypeAnnotation.getInstance(); } + }, + FLOAT16 { + @Override + protected LogicalTypeAnnotation fromString(List params) { + return float16Type(); + } }; protected abstract LogicalTypeAnnotation fromString(List params); @@ -306,6 +312,10 @@ public static UUIDLogicalTypeAnnotation uuidType() { return UUIDLogicalTypeAnnotation.INSTANCE; } + public static Float16LogicalTypeAnnotation float16Type() { + return Float16LogicalTypeAnnotation.INSTANCE; + } + public static class StringLogicalTypeAnnotation extends LogicalTypeAnnotation { private static final StringLogicalTypeAnnotation INSTANCE = new StringLogicalTypeAnnotation(); @@ -951,6 +961,34 @@ PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { } } + public static class Float16LogicalTypeAnnotation extends LogicalTypeAnnotation { + private static final Float16LogicalTypeAnnotation INSTANCE = new Float16LogicalTypeAnnotation(); + public static final int BYTES = 2; + + private Float16LogicalTypeAnnotation() {} + + @Override + public OriginalType toOriginalType() { + // No OriginalType for Float16 + return null; + } + + @Override + public Optional accept(LogicalTypeAnnotationVisitor logicalTypeAnnotationVisitor) { + return 
logicalTypeAnnotationVisitor.visit(this); + } + + @Override + LogicalTypeToken getType() { + return LogicalTypeToken.FLOAT16; + } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.FLOAT16_STRINGIFIER; + } + } + // This logical type annotation is implemented to support backward compatibility with ConvertedType. // The new logical type representation in parquet-format doesn't have any interval type, // thus this annotation is mapped to UNKNOWN. @@ -1120,5 +1158,9 @@ default Optional visit(IntervalLogicalTypeAnnotation intervalLogicalType) { default Optional visit(MapKeyValueTypeAnnotation mapKeyValueLogicalType) { return empty(); } + + default Optional visit(Float16LogicalTypeAnnotation float16LogicalType) { + return empty(); + } } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index be84f35782..50c4acd4c9 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -277,4 +277,20 @@ public String toString() { return "BINARY_AS_SIGNED_INTEGER_COMPARATOR"; } }; + + /** + * This comparator is for comparing two float16 values represented in 2 bytes binary. 
+ */ + static final PrimitiveComparator BINARY_AS_FLOAT16_COMPARATOR = new BinaryComparator() { + + @Override + int compareBinary(Binary b1, Binary b2) { + return Float16.compare(b1.get2BytesLittleEndian(), b2.get2BytesLittleEndian()); + } + + @Override + public String toString() { + return "BINARY_AS_FLOAT16_COMPARATOR"; + } + }; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java index 471c4b8d78..c46e94367f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java @@ -441,4 +441,12 @@ private void appendHex(byte[] array, int offset, int length, StringBuilder build } } }; + + static final PrimitiveStringifier FLOAT16_STRINGIFIER = new BinaryStringifierBase("FLOAT16_STRINGIFIER") { + + @Override + String stringifyNotNull(Binary value) { + return Float16.toFloatString(value); + } + }; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 2867d0d46f..e74d7cde02 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -399,6 +399,12 @@ public Optional visit( public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) { return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); } + + @Override + public Optional visit( + LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { + return of(PrimitiveComparator.BINARY_AS_FLOAT16_COMPARATOR); + } }) .orElseThrow(() -> new ShouldNeverHappenException( "No comparator logic implemented for FIXED_LEN_BYTE_ARRAY logical type: " @@ -605,6 +611,14 @@ public PrimitiveType withId(int id) { getRepetition(), primitive, length, 
getName(), getLogicalTypeAnnotation(), new ID(id), columnOrder); } + /** + * @param logicalType LogicalTypeAnnotation + * @return a new PrimitiveType with the same fields and a new id null + */ + public PrimitiveType withLogicalTypeAnnotation(LogicalTypeAnnotation logicalType) { + return new PrimitiveType(getRepetition(), primitive, length, getName(), logicalType, getId()); + } + /** * @return the primitive type */ diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java index 5c9dcddcfd..8821df2b82 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java @@ -462,6 +462,13 @@ public Optional visit( LogicalTypeAnnotation.UUIDLogicalTypeAnnotation.BYTES, uuidLogicalType); } + @Override + public Optional visit( + LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { + return checkFixedPrimitiveType( + LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES, float16LogicalType); + } + @Override public Optional visit( LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { diff --git a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java index 55bc64c1d2..a1a83af771 100644 --- a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java +++ b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java @@ -22,6 +22,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -268,4 +269,49 @@ public void testCompare() { assertTrue(b1.compareTo(b3) == 0); assertTrue(b3.compareTo(b1) == 0); } + + @Test + public void testGet2BytesLittleEndian() { + // 
ByteBufferBackedBinary: get2BytesLittleEndian + Binary b1 = Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] {0x01, 0x02})); + assertEquals((short) 0x0201, b1.get2BytesLittleEndian()); + + // ByteArrayBackedBinary: get2BytesLittleEndian + Binary b2 = Binary.fromConstantByteArray(new byte[] {0x01, 0x02}); + assertEquals((short) 0x0201, b2.get2BytesLittleEndian()); + + // ByteArraySliceBackedBinary: get2BytesLittleEndian + Binary b3 = Binary.fromConstantByteArray(new byte[] {0x00, 0x01, 0x02, 0x03}, 1, 2); + assertEquals((short) 0x0201, b3.get2BytesLittleEndian()); + } + + @Test + public void testGet2BytesLittleEndianWrongLength() { + // ByteBufferBackedBinary: get2BytesLittleEndian + Binary b1 = Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] {0x01, 0x02, 0x03})); + try { + b1.get2BytesLittleEndian(); + fail("Should have thrown an exception"); + } catch (IllegalArgumentException e) { + // expected + } + + // ByteArrayBackedBinary: get2BytesLittleEndian + Binary b2 = Binary.fromConstantByteArray(new byte[] {0x01, 0x02, 0x03}); + try { + b2.get2BytesLittleEndian(); + fail("Should have thrown an exception"); + } catch (IllegalArgumentException e) { + // expected + } + + // ByteArraySliceBackedBinary: get2BytesLittleEndian + Binary b3 = Binary.fromConstantByteArray(new byte[] {0x00, 0x01, 0x02, 0x03}, 1, 3); + try { + b3.get2BytesLittleEndian(); + fail("Should have thrown an exception"); + } catch (IllegalArgumentException e) { + // expected + } + } } diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestFloat16.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestFloat16.java new file mode 100644 index 0000000000..1c3a30f234 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestFloat16.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.schema; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.parquet.io.api.Binary; +import org.junit.Test; + +public class TestFloat16 { + // Smallest negative value a half-precision float may have. + private static final short LOWEST_VALUE = (short) 0xfbff; + // Maximum positive finite value a half-precision float may have. + private static final short MAX_VALUE = (short) 0x7bff; + // Smallest positive non-zero value a half-precision float may have. + private static final short MIN_VALUE = (short) 0x0001; + // Positive 0 of type half-precision float. + private static final short POSITIVE_ZERO = (short) 0x0000; + // Negative 0 of type half-precision float. + private static final short NEGATIVE_ZERO = (short) 0x8000; + // A Not-a-Number representation of a half-precision float. + private static final short NaN = (short) 0x7e00; + // Positive infinity of type half-precision float. + private static final short POSITIVE_INFINITY = (short) 0x7c00; + // Negative infinity of type half-precision float. 
+ private static final short NEGATIVE_INFINITY = (short) 0xfc00; + + @Test + public void testFloat16ToFloat() { + // Zeroes + assertEquals(0.0f, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {0x00, 0x00})), 0.0f); + assertEquals(-0.0f, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})), 0.0f); + // NaN + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xc0, (byte) 0x7f})), 0.0f); + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7e})), 0.0f); + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7f})), 0.0f); + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfe})), 0.0f); + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xff})), 0.0f); + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x7f, (byte) 0x7e})), 0.0f); + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x7f, (byte) 0xfe})), 0.0f); + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0xfe})), 0.0f); + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7f})), 0.0f); + assertEquals( + Float.NaN, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0xff})), 0.0f); + // infinities + assertEquals( + Float.POSITIVE_INFINITY, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})), + 0.0f); + assertEquals( + Float.NEGATIVE_INFINITY, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc})), + 0.0f); + // subnormals + assertEquals( + 5.9604645E-8f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 
0x00})), + 0.0f); + assertEquals( + -65504.0f, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0xfb})), 0.0f); + assertEquals( + +65504.0f, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b})), 0.0f); + assertEquals( + -6.097555E-5f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x83})), + 0.0f); + assertEquals( + -5.9604645E-8f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x80})), + 0.0f); + // Known values + assertEquals( + 1.0009765625f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x3c})), + 0.0f); + assertEquals(-2.0f, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0})), 0.0f); + assertEquals( + 6.1035156e-5f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x04})), + 0.0f); // Inexact + assertEquals( + 65504.0f, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b})), 0.0f); + assertEquals( + 0.33325195f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x55, (byte) 0x35})), + 0.0f); // Inexact + // Denormals (flushed to +/-0) + assertEquals( + 6.097555e-5f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x03})), + 0.0f); + assertEquals( + 5.9604645e-8f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00})), + 0.0f); // Inexact + assertEquals( + -6.097555e-5f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x83})), + 0.0f); + assertEquals( + -5.9604645e-8f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x80})), + 0.0f); // Inexact + // Miscellaneous values. 
In general, they're chosen to test the sign/exponent and + // exponent/mantissa boundaries + assertEquals( + +0.00050163269043f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x10})), + 0.0f); + assertEquals( + -0.00050163269043f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x90})), + 0.0f); + assertEquals( + +0.000502109527588f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1d, (byte) 0x10})), + 0.0f); + assertEquals( + -0.000502109527588f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1d, (byte) 0x90})), + 0.0f); + assertEquals( + +0.00074577331543f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x12})), + 0.0f); + assertEquals( + -0.00074577331543f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x92})), + 0.0f); + assertEquals( + +0.00100326538086f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x14})), + 0.0f); + assertEquals( + -0.00100326538086f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x94})), + 0.0f); + assertEquals( + +32.875f, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x50})), 0.0f); + assertEquals( + -32.875f, Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0xd0})), 0.0f); + // A few subnormals for good measure + assertEquals( + +1.66893005371e-06f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x00})), + 0.0f); + assertEquals( + -1.66893005371e-06f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x80})), + 0.0f); + assertEquals( + +3.21865081787e-05f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x02})), + 0.0f); + assertEquals( + -3.21865081787e-05f, + Float16.toFloat(Binary.fromConstantByteArray(new byte[] {(byte) 0x1c, (byte) 0x82})), + 0.0f); + 
} + + @Test + public void testFloatToFloat16() { + // Zeroes, NaN and infinities + assertEquals(POSITIVE_ZERO, Float16.toFloat16(0.0f)); + assertEquals(NEGATIVE_ZERO, Float16.toFloat16(-0.0f)); + assertEquals(NaN, Float16.toFloat16(Float.NaN)); + assertEquals(POSITIVE_INFINITY, Float16.toFloat16(Float.POSITIVE_INFINITY)); + assertEquals(NEGATIVE_INFINITY, Float16.toFloat16(Float.NEGATIVE_INFINITY)); + // Known values + assertEquals((short) 0x3c01, Float16.toFloat16(1.0009765625f)); + assertEquals((short) 0xc000, Float16.toFloat16(-2.0f)); + assertEquals((short) 0x0400, Float16.toFloat16(6.10352e-5f)); + assertEquals((short) 0x7bff, Float16.toFloat16(65504.0f)); + assertEquals((short) 0x3555, Float16.toFloat16(1.0f / 3.0f)); + // Subnormals + assertEquals((short) 0x03ff, Float16.toFloat16(6.09756e-5f)); + assertEquals(MIN_VALUE, Float16.toFloat16(5.96046e-8f)); + assertEquals((short) 0x83ff, Float16.toFloat16(-6.09756e-5f)); + assertEquals((short) 0x8001, Float16.toFloat16(-5.96046e-8f)); + // Subnormals (flushed to +/-0) + assertEquals(POSITIVE_ZERO, Float16.toFloat16(5.96046e-9f)); + assertEquals(NEGATIVE_ZERO, Float16.toFloat16(-5.96046e-9f)); + // Test for values that overflow the mantissa bits into exp bits + assertEquals((short) 0x1000, Float16.toFloat16(Float.intBitsToFloat(0x39fff000))); + assertEquals((short) 0x0400, Float16.toFloat16(Float.intBitsToFloat(0x387fe000))); + // Floats with absolute value above +/-65519 are rounded to +/-inf + // when using round-to-even + assertEquals((short) 0x7bff, Float16.toFloat16(65519.0f)); + assertEquals((short) 0x7bff, Float16.toFloat16(65519.9f)); + assertEquals(POSITIVE_INFINITY, Float16.toFloat16(65520.0f)); + assertEquals(NEGATIVE_INFINITY, Float16.toFloat16(-65520.0f)); + // Check if numbers are rounded to nearest even when they + // cannot be accurately represented by Half + assertEquals((short) 0x6800, Float16.toFloat16(2049.0f)); + assertEquals((short) 0x6c00, Float16.toFloat16(4098.0f)); + assertEquals((short) 
0x7000, Float16.toFloat16(8196.0f)); + assertEquals((short) 0x7400, Float16.toFloat16(16392.0f)); + assertEquals((short) 0x7800, Float16.toFloat16(32784.0f)); + // Miscellaneous values. In general, they're chosen to test the sign/exponent and + // exponent/mantissa boundaries + assertEquals((short) 0x101c, Float16.toFloat16(+0.00050163269043f)); + assertEquals((short) 0x901c, Float16.toFloat16(-0.00050163269043f)); + assertEquals((short) 0x101d, Float16.toFloat16(+0.000502109527588f)); + assertEquals((short) 0x901d, Float16.toFloat16(-0.000502109527588f)); + assertEquals((short) 0x121c, Float16.toFloat16(+0.00074577331543f)); + assertEquals((short) 0x921c, Float16.toFloat16(-0.00074577331543f)); + assertEquals((short) 0x141c, Float16.toFloat16(+0.00100326538086f)); + assertEquals((short) 0x941c, Float16.toFloat16(-0.00100326538086f)); + assertEquals((short) 0x501c, Float16.toFloat16(+32.875f)); + assertEquals((short) 0xd01c, Float16.toFloat16(-32.875f)); + // A few subnormals for good measure + assertEquals((short) 0x001c, Float16.toFloat16(+1.66893005371e-06f)); + assertEquals((short) 0x801c, Float16.toFloat16(-1.66893005371e-06f)); + assertEquals((short) 0x021c, Float16.toFloat16(+3.21865081787e-05f)); + assertEquals((short) 0x821c, Float16.toFloat16(-3.21865081787e-05f)); + } + + @Test + public void testIsNaN() { + assertFalse(Float16.isNaN(POSITIVE_INFINITY)); + assertFalse(Float16.isNaN(NEGATIVE_INFINITY)); + assertFalse(Float16.isNaN(POSITIVE_ZERO)); + assertFalse(Float16.isNaN(NEGATIVE_ZERO)); + assertTrue(Float16.isNaN(NaN)); + assertTrue(Float16.isNaN((short) 0x7c01)); + assertTrue(Float16.isNaN((short) 0x7c18)); + assertTrue(Float16.isNaN((short) 0xfc01)); + assertTrue(Float16.isNaN((short) 0xfc98)); + assertFalse(Float16.isNaN(MAX_VALUE)); + assertFalse(Float16.isNaN(LOWEST_VALUE)); + assertFalse(Float16.isNaN(Float16.toFloat16(-128.3f))); + assertFalse(Float16.isNaN(Float16.toFloat16(128.3f))); + } + + @Test + public void testCompare() { + 
assertEquals(0, Float16.compare(NaN, NaN)); + assertEquals(0, Float16.compare(NaN, (short) 0xfc98)); + assertEquals(1, Float16.compare(NaN, POSITIVE_INFINITY)); + assertEquals(-1, Float16.compare(POSITIVE_INFINITY, NaN)); + assertEquals(0, Float16.compare(POSITIVE_INFINITY, POSITIVE_INFINITY)); + assertEquals(0, Float16.compare(NEGATIVE_INFINITY, NEGATIVE_INFINITY)); + assertEquals(1, Float16.compare(POSITIVE_INFINITY, NEGATIVE_INFINITY)); + assertEquals(-1, Float16.compare(NEGATIVE_INFINITY, POSITIVE_INFINITY)); + assertEquals(0, Float16.compare(POSITIVE_ZERO, POSITIVE_ZERO)); + assertEquals(0, Float16.compare(NEGATIVE_ZERO, NEGATIVE_ZERO)); + assertEquals(1, Float16.compare(POSITIVE_ZERO, NEGATIVE_ZERO)); + assertEquals(-1, Float16.compare(NEGATIVE_ZERO, POSITIVE_ZERO)); + assertEquals(0, Float16.compare(Float16.toFloat16(12.462f), Float16.toFloat16(12.462f))); + assertEquals(0, Float16.compare(Float16.toFloat16(-12.462f), Float16.toFloat16(-12.462f))); + assertEquals(1, Float16.compare(Float16.toFloat16(12.462f), Float16.toFloat16(-12.462f))); + assertEquals(-1, Float16.compare(Float16.toFloat16(-12.462f), Float16.toFloat16(12.462f))); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java index 6c96fac7a6..d3d1b15bc6 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -18,6 +18,7 @@ */ package org.apache.parquet.schema; +import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_FLOAT16_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.BOOLEAN_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.DOUBLE_COMPARATOR; @@ -273,6 +274,33 @@ public void 
testBinaryAsSignedIntegerComparatorWithEquals() { } } + @Test + public void testFloat16Comparator() { + Binary[] valuesInAscendingOrder = { + Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0xfc}), // -Infinity + Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0xc0}), // -2.0 + Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x84}), // -6.109476E-5 + Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0 + Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0 + Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8 + Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0 + Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c}) + }; // Infinity + + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Binary bi = valuesInAscendingOrder[i]; + Binary bj = valuesInAscendingOrder[j]; + float fi = Float16.toFloat(bi); + float fj = Float16.toFloat(bj); + assertEquals(Float.compare(fi, fj), BINARY_AS_FLOAT16_COMPARATOR.compare(bi, bj)); + if (i < j) { + assertEquals(-1, Float.compare(fi, fj)); + } + } + } + } + private void testObjectComparator(PrimitiveComparator comparator, T... 
valuesInAscendingOrder) { for (int i = 0; i < valuesInAscendingOrder.length; ++i) { for (int j = 0; j < valuesInAscendingOrder.length; ++j) { diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java index 660e1b6bef..3101ecea0d 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java @@ -347,6 +347,42 @@ public void testDecimalStringifier() { checkThrowingUnsupportedException(stringifier, Integer.TYPE, Long.TYPE, Binary.class); } + @Test + public void testFloat16Stringifier() { + PrimitiveStringifier stringifier = PrimitiveStringifier.FLOAT16_STRINGIFIER; + + // Zeroes, NaN and infinities + assertEquals("0.0", stringifier.stringify(toBinary(0x00, 0x00))); + assertEquals("-0.0", stringifier.stringify(toBinary(0x00, 0x80))); + assertEquals(Float.toString(Float.NaN), stringifier.stringify(toBinary(0x00, 0x7e))); + assertEquals(Float.toString(Float.POSITIVE_INFINITY), stringifier.stringify(toBinary(0x00, 0x7c))); + assertEquals(Float.toString(Float.NEGATIVE_INFINITY), stringifier.stringify(toBinary(0x00, 0xfc))); + + // Known values + assertEquals("1.0009766", stringifier.stringify(toBinary(0x01, 0x3c))); + assertEquals("-2.0", stringifier.stringify(toBinary(0x00, 0xc0))); + assertEquals("6.1035156E-5", stringifier.stringify(toBinary(0x00, 0x04))); + assertEquals("65504.0", stringifier.stringify(toBinary(0xff, 0x7b))); + assertEquals("0.33325195", stringifier.stringify(toBinary(0x55, 0x35))); + + // Subnormals + assertEquals("6.097555E-5", stringifier.stringify(toBinary(0xff, 0x03))); + assertEquals("5.9604645E-8", stringifier.stringify(toBinary(0x01, 0x00))); + assertEquals("-6.097555E-5", stringifier.stringify(toBinary(0xff, 0x83))); + assertEquals("-5.9604645E-8", stringifier.stringify(toBinary(0x01, 0x80))); + 
+ // Floats with absolute value above +/-65519 are rounded to +/-inf + // when using round-to-even + assertEquals("65504.0", stringifier.stringify(toBinary(0xff, 0x7b))); + + // Check if numbers are rounded to nearest even when they + // cannot be accurately represented by Half + assertEquals("2048.0", stringifier.stringify(toBinary(0x00, 0x68))); + assertEquals("4096.0", stringifier.stringify(toBinary(0x00, 0x6c))); + + checkThrowingUnsupportedException(stringifier, Integer.TYPE, Long.TYPE, Binary.class); + } + @Test public void testUUIDStringifier() { PrimitiveStringifier stringifier = PrimitiveStringifier.UUID_STRINGIFIER; diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java index ef664cb241..54853e8138 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java @@ -24,6 +24,7 @@ import static org.apache.parquet.schema.LogicalTypeAnnotation.bsonType; import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType; import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.float16Type; import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; import static org.apache.parquet.schema.LogicalTypeAnnotation.jsonType; import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; @@ -212,9 +213,19 @@ public void testBinaryAnnotations() { } } + @Test + public void testFloat16Annotations() { + LogicalTypeAnnotation type = float16Type(); + PrimitiveType expected = new PrimitiveType(REQUIRED, FIXED_LEN_BYTE_ARRAY, 2, "col", type, null); + PrimitiveType string = + Types.required(FIXED_LEN_BYTE_ARRAY).as(type).length(2).named("col"); + Assert.assertEquals(expected, string); + } 
+ @Test public void testBinaryAnnotationsRejectsNonBinary() { - LogicalTypeAnnotation[] types = new LogicalTypeAnnotation[] {stringType(), jsonType(), bsonType()}; + LogicalTypeAnnotation[] types = + new LogicalTypeAnnotation[] {stringType(), jsonType(), bsonType(), float16Type()}; for (final LogicalTypeAnnotation logicalType : types) { PrimitiveTypeName[] nonBinary = new PrimitiveTypeName[] {BOOLEAN, INT32, INT64, INT96, DOUBLE, FLOAT}; for (final PrimitiveTypeName type : nonBinary) { @@ -440,6 +451,28 @@ public void testUUIDLogicalType() { () -> Types.required(BINARY).as(uuidType()).named("uuid_field").toString()); } + @Test + public void testFloat16LogicalType() { + assertEquals( + "required fixed_len_byte_array(2) float16_field (FLOAT16)", + Types.required(FIXED_LEN_BYTE_ARRAY) + .length(2) + .as(float16Type()) + .named("float16_field") + .toString()); + + assertThrows("Should fail with invalid length", IllegalStateException.class, () -> Types.required( + FIXED_LEN_BYTE_ARRAY) + .length(10) + .as(float16Type()) + .named("float16_field") + .toString()); + assertThrows("Should fail with invalid type", IllegalStateException.class, () -> Types.required(BINARY) + .as(float16Type()) + .named("float16_field") + .toString()); + } + /** * A convenience method to avoid a large number of @Test(expected=...) 
tests * diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java index 717cd97cd4..b2d70c9247 100644 --- a/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java +++ b/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java @@ -52,4 +52,5 @@ public static LogicalType DECIMAL(int scale, int precision) { public static final LogicalType UNKNOWN = LogicalType.UNKNOWN(new NullType()); public static final LogicalType JSON = LogicalType.JSON(new JsonType()); public static final LogicalType BSON = LogicalType.BSON(new BsonType()); + public static final LogicalType FLOAT16 = LogicalType.FLOAT16(new Float16Type()); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 2d416fc529..47ab0649da 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -83,6 +83,7 @@ import org.apache.parquet.format.EnumType; import org.apache.parquet.format.FieldRepetitionType; import org.apache.parquet.format.FileMetaData; +import org.apache.parquet.format.Float16Type; import org.apache.parquet.format.IntType; import org.apache.parquet.format.JsonType; import org.apache.parquet.format.KeyValue; @@ -508,6 +509,11 @@ public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) { return of(LogicalType.UUID(new UUIDType())); } + @Override + public Optional visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { + return of(LogicalType.FLOAT16(new Float16Type())); + } + @Override public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { return 
of(LogicalType.UNKNOWN(new NullType())); @@ -876,7 +882,8 @@ enum SortOrder { private static final Set STRING_TYPES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( LogicalTypeAnnotation.StringLogicalTypeAnnotation.class, LogicalTypeAnnotation.EnumLogicalTypeAnnotation.class, - LogicalTypeAnnotation.JsonLogicalTypeAnnotation.class))); + LogicalTypeAnnotation.JsonLogicalTypeAnnotation.class, + LogicalTypeAnnotation.Float16LogicalTypeAnnotation.class))); /** * Returns whether to use signed order min and max with a type. It is safe to @@ -973,6 +980,12 @@ public Optional visit( return of(SortOrder.UNSIGNED); } + @Override + public Optional visit( + LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { + return of(SortOrder.SIGNED); + } + @Override public Optional visit( LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { @@ -1149,6 +1162,8 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { return LogicalTypeAnnotation.timestampType(timestamp.isAdjustedToUTC, convertTimeUnit(timestamp.unit)); case UUID: return LogicalTypeAnnotation.uuidType(); + case FLOAT16: + return LogicalTypeAnnotation.float16Type(); default: throw new RuntimeException("Unknown logical type " + type); } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 5efe867811..3be9f6fdee 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -1020,6 +1020,27 @@ private void testUseStatsWithSignedSortOrder(StatsHelper helper) { } } + @Test + public void testFloat16Stats() { + Statistics stats = Statistics.createStats( + new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, 2, "float16") + 
.withLogicalTypeAnnotation(LogicalTypeAnnotation.float16Type())); + stats.updateStats(toBinary(0xff, 0x03)); + stats.updateStats(toBinary(0xff, 0x7b)); + String expectedMinStr = "6.097555E-5"; + String expectedMaxStr = "65504.0"; + assertEquals(expectedMinStr, stats.minAsString()); + assertEquals(expectedMaxStr, stats.maxAsString()); + } + + private Binary toBinary(int... bytes) { + byte[] array = new byte[bytes.length]; + for (int i = 0; i < array.length; ++i) { + array[i] = (byte) bytes[i]; + } + return Binary.fromConstantByteArray(array); + } + @Test public void testMissingValuesFromStats() { ParquetMetadataConverter converter = new ParquetMetadataConverter(); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java new file mode 100644 index 0000000000..9bbc6d1c64 --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.parquet.statistics;
+
+import static org.apache.parquet.schema.LogicalTypeAnnotation.float16Type;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.example.data.GroupFactory;
+import org.apache.parquet.example.data.simple.SimpleGroupFactory;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.example.ExampleParquetWriter;
+import org.apache.parquet.hadoop.example.GroupWriteSupport;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.internal.column.columnindex.ColumnIndex;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.Types;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+/**
+ * Writes FLOAT16 values (2-byte FIXED_LEN_BYTE_ARRAY, low byte first) to a Parquet file and
+ * checks the min/max values read back from the column index.
+ */
+public class TestFloat16ReadWriteRoundTrip {
+
+    @Rule
+    public TemporaryFolder temp = new TemporaryFolder();
+
+    // Input data sets: each array is written to its own file by testFloat16ColumnIndex.
+    private Binary[] valuesInAscendingOrder = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc}), // -Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x84}), // -6.109476E-5
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8
+        Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
+    }; // Infinity
+
+    private Binary[] valuesInDescendingOrder = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c}), // Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x84}), // -6.109476E-5
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc})
+    }; // -Infinity
+
+    private Binary[] valuesUndefinedOrder = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c}), // Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x84}), // -6.109476E-5
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc})
+    }; // -Infinity
+
+    private Binary[] valuesAllPositiveZero = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
+    }; // +0
+
+    private Binary[] valuesAllNegativeZero = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
+    }; // -0
+
+    private Binary[] valuesWithNaN = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7e})
+    }; // NaN
+
+    // Expected results: element 0 is the expected column-index min, element 1 the expected max,
+    // for the input array of the matching name.
+    private Binary[] valuesInAscendingOrderMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc}), // -Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
+    }; // Infinity
+
+    private Binary[] valuesInDescendingOrderMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc}), // -Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
+    }; // Infinity
+
+    private Binary[] valuesUndefinedOrderMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc}), // -Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
+    }; // Infinity
+
+    private Binary[] valuesAllPositiveZeroMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
+    }; // +0
+
+    private Binary[] valuesAllNegativeZeroMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
+    }; // -0
+
+    private Binary[] valuesWithNaNMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7e})
+    }; // NaN
+
+    /**
+     * For every input data set: writes the values to a fresh file with dictionary encoding
+     * disabled, reads the column index of the first column of the first block, and asserts that
+     * it holds exactly one min and one max equal to the expected pair.
+     */
+    @Test
+    public void testFloat16ColumnIndex() throws IOException {
+        // testValues.get(i) is checked against expectedValues.get(i); keep both lists aligned.
+        List<Binary[]> testValues = Arrays.asList(
+                valuesInAscendingOrder,
+                valuesInDescendingOrder,
+                valuesUndefinedOrder,
+                valuesAllPositiveZero,
+                valuesAllNegativeZero,
+                valuesWithNaN);
+        List<Binary[]> expectedValues = Arrays.asList(
+                valuesInAscendingOrderMinMax,
+                valuesInDescendingOrderMinMax,
+                valuesUndefinedOrderMinMax,
+                valuesAllPositiveZeroMinMax,
+                valuesAllNegativeZeroMinMax,
+                valuesWithNaNMinMax);
+
+        for (int i = 0; i < testValues.size(); i++) {
+            MessageType schema = Types.buildMessage()
+                    .required(FIXED_LEN_BYTE_ARRAY)
+                    .as(float16Type())
+                    .length(2)
+                    .named("col_float16")
+                    .named("msg");
+
+            Configuration conf = new Configuration();
+            GroupWriteSupport.setSchema(schema, conf);
+            GroupFactory factory = new SimpleGroupFactory(schema);
+            Path path = newTempPath();
+            try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+                    .withConf(conf)
+                    .withDictionaryEncoding(false)
+                    .build()) {
+
+                for (Binary value : testValues.get(i)) {
+                    writer.write(factory.newGroup().append("col_float16", value));
+                }
+            }
+
+            try (ParquetFileReader reader =
+                    ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+                ColumnChunkMetaData column =
+                        reader.getFooter().getBlocks().get(0).getColumns().get(0);
+                ColumnIndex index = reader.readColumnIndex(column);
+                assertEquals(Collections.singletonList(expectedValues.get(i)[0]), toFloat16List(index.getMinValues()));
+                assertEquals(Collections.singletonList(expectedValues.get(i)[1]), toFloat16List(index.getMaxValues()));
+            }
+        }
+    }
+
+    /**
+     * Creates a temp file and deletes it again, returning its path so that the path itself does
+     * not exist when it is handed to the writer.
+     */
+    private Path newTempPath() throws IOException {
+        File file = temp.newFile();
+        Preconditions.checkArgument(file.delete(), "Could not remove temp file");
+        return new Path(file.getAbsolutePath());
+    }
+
+    /**
+     * Wraps the backing array of each buffer in a {@link Binary} so the list can be compared with
+     * the expected values. NOTE(review): relies on {@code ByteBuffer.array()}, i.e. assumes every
+     * buffer is array-backed and covers its whole backing array - confirm for ColumnIndex buffers.
+     */
+    private static List<Binary> toFloat16List(List<ByteBuffer> buffers) {
+        return buffers.stream()
+                .map(buffer -> Binary.fromConstantByteArray(buffer.array()))
+                .collect(Collectors.toList());
+    }
+}
diff
--git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java new file mode 100644 index 0000000000..5e82740a4a --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.parquet.statistics;
+
+import static org.apache.parquet.schema.LogicalTypeAnnotation.float16Type;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
+import static org.junit.Assert.assertArrayEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.column.statistics.Statistics;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.example.data.GroupFactory;
+import org.apache.parquet.example.data.simple.SimpleGroupFactory;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.example.ExampleParquetWriter;
+import org.apache.parquet.hadoop.example.GroupWriteSupport;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.Float16;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.Types;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+/**
+ * Writes FLOAT16 values (2-byte FIXED_LEN_BYTE_ARRAY, low byte first) to a Parquet file and
+ * checks the min/max bytes stored in the column chunk statistics.
+ */
+public class TestFloat16Statistics {
+
+    @Rule
+    public TemporaryFolder temp = new TemporaryFolder();
+
+    // Each input array below is followed by its expected {min, max} statistics pair.
+    // testFloat16Statistics also indexes valuesInAscendingOrder directly: index 3 must stay -0
+    // and index 4 must stay +0.
+    private Binary[] valuesInAscendingOrder = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc}), // -Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x84}), // -6.109476E-5
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8
+        Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
+    }; // Infinity
+
+    private Binary[] valuesInAscendingOrderMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc}), // -Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
+    }; // Infinity
+
+    private Binary[] valuesInDescendingOrder = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c}), // Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x84}), // -6.109476E-5
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc})
+    }; // -Infinity
+
+    private Binary[] valuesInDescendingOrderMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc}), // -Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
+    }; // Infinity
+
+    private Binary[] valuesUndefinedOrder = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c}), // Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x84}), // -6.109476E-5
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc})
+    }; // -Infinity
+
+    private Binary[] valuesUndefinedOrderMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xfc}), // -Infinity
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
+    }; // Infinity
+
+    private Binary[] valuesAllPositiveZero = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
+    }; // +0
+
+    // Float16Builder: Updating min to -0.0 to ensure that no 0.0 values would be skipped
+    private Binary[] valuesAllPositiveStatsZeroMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
+    }; // +0
+
+    private Binary[] valuesAllNegativeZero = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
+    }; // -0
+
+    // Float16Builder: Updating max to +0.0 to ensure that no 0.0 values would be skipped
+    private Binary[] valuesAllNegativeStatsZeroMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
+    }; // +0
+
+    private Binary[] valuesWithNaN = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0x7b}), // 65504.0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x01, (byte) 0x00}), // 5.9604645E-8
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7e})
+    }; // NaN
+
+    // Float16Builder: Drop min/max values in case of NaN as the sorting order of values is undefined
+    private Binary[] valuesWithNaNStatsMinMax = {
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+        Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
+    }; // -0
+
+    /**
+     * For every input data set: writes the values to a fresh file with dictionary encoding
+     * disabled, then asserts that the min/max bytes of the first column chunk's statistics equal
+     * the expected pair.
+     */
+    @Test
+    public void testFloat16StatisticsMultipleCases() throws IOException {
+        // testValues.get(i) is checked against expectedValues.get(i); keep both lists aligned.
+        List<Binary[]> testValues = Arrays.asList(
+                valuesInAscendingOrder,
+                valuesInDescendingOrder,
+                valuesUndefinedOrder,
+                valuesAllPositiveZero,
+                valuesAllNegativeZero,
+                valuesWithNaN);
+        List<Binary[]> expectedValues = Arrays.asList(
+                valuesInAscendingOrderMinMax,
+                valuesInDescendingOrderMinMax,
+                valuesUndefinedOrderMinMax,
+                valuesAllPositiveStatsZeroMinMax,
+                valuesAllNegativeStatsZeroMinMax,
+                valuesWithNaNStatsMinMax);
+
+        for (int i = 0; i < testValues.size(); ++i) {
+            MessageType schema = Types.buildMessage()
+                    .required(FIXED_LEN_BYTE_ARRAY)
+                    .as(float16Type())
+                    .length(2)
+                    .named("col_float16")
+                    .named("msg");
+
+            Configuration conf = new Configuration();
+            GroupWriteSupport.setSchema(schema, conf);
+
+            GroupFactory factory = new SimpleGroupFactory(schema);
+            Path path = newTempPath();
+            try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+                    .withConf(conf)
+                    .withDictionaryEncoding(false)
+                    .build()) {
+
+                for (Binary value : testValues.get(i)) {
+                    writer.write(factory.newGroup().append("col_float16", value));
+                }
+            }
+
+            try (ParquetFileReader reader =
+                    ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+                ColumnChunkMetaData column =
+                        reader.getFooter().getBlocks().get(0).getColumns().get(0);
+                Statistics<?> statistics = column.getStatistics();
+
+                assertArrayEquals(expectedValues.get(i)[0].getBytes(), statistics.getMinBytes());
+                assertArrayEquals(expectedValues.get(i)[1].getBytes(), statistics.getMaxBytes());
+            }
+        }
+    }
+
+    /**
+     * Writes every ordered pair (i, j) of {@code valuesInAscendingOrder} as a two-row file and
+     * asserts that the stored statistics hold the smaller value as min and the larger as max,
+     * after the zero adjustments described inline.
+     */
+    @Test
+    public void testFloat16Statistics() throws IOException {
+        for (int i = 0; i < valuesInAscendingOrder.length; ++i) {
+            for (int j = 0; j < valuesInAscendingOrder.length; ++j) {
+                int minIndex = i;
+                int maxIndex = j;
+
+                // Swap so that minIndex points at the smaller of the two values.
+                if (Float16.compare(
+                                valuesInAscendingOrder[i].get2BytesLittleEndian(),
+                                valuesInAscendingOrder[j].get2BytesLittleEndian())
+                        > 0) {
+                    minIndex = j;
+                    maxIndex = i;
+                }
+
+                // Refer to Float16Builder class: a +0 min is expected back as -0 (index 3) and
+                // a -0 max is expected back as +0 (index 4).
+                if (valuesInAscendingOrder[minIndex].get2BytesLittleEndian() == (short) 0x0000) {
+                    minIndex = 3;
+                }
+                if (valuesInAscendingOrder[maxIndex].get2BytesLittleEndian() == (short) 0x8000) {
+                    maxIndex = 4;
+                }
+
+                MessageType schema = Types.buildMessage()
+                        .required(FIXED_LEN_BYTE_ARRAY)
+                        .as(float16Type())
+                        .length(2)
+                        .named("col_float16")
+                        .named("msg");
+
+                Configuration conf = new Configuration();
+                GroupWriteSupport.setSchema(schema, conf);
+
+                GroupFactory factory = new SimpleGroupFactory(schema);
+                Path path = newTempPath();
+                try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+                        .withConf(conf)
+                        .withDictionaryEncoding(false)
+                        .build()) {
+                    writer.write(factory.newGroup().append("col_float16", valuesInAscendingOrder[i]));
+                    writer.write(factory.newGroup().append("col_float16", valuesInAscendingOrder[j]));
+                }
+
+                try (ParquetFileReader reader =
+                        ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+                    ColumnChunkMetaData column =
+                            reader.getFooter().getBlocks().get(0).getColumns().get(0);
+                    Statistics<?> statistics = column.getStatistics();
+
+                    assertArrayEquals(valuesInAscendingOrder[minIndex].getBytes(), statistics.getMinBytes());
+                    assertArrayEquals(valuesInAscendingOrder[maxIndex].getBytes(), statistics.getMaxBytes());
+                }
+            }
+        }
+    }
+
+    /**
+     * Creates a temp file and deletes it again, returning its path so that the path itself does
+     * not exist when it is handed to the writer.
+     */
+    private Path newTempPath() throws IOException {
+        File file = temp.newFile();
+        Preconditions.checkArgument(file.delete(), "Could not remove temp file");
+        return new Path(file.getAbsolutePath());
+    }
+}
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java
index
0904b6537e..8562cf9337 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java
@@ -57,6 +57,8 @@
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.io.api.PrimitiveConverter;
+import org.apache.parquet.schema.Float16;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.OriginalType;
 import org.apache.parquet.schema.PrimitiveType;
@@ -322,7 +324,8 @@ public DataContext(
           new RandomValues.LongGenerator(random.nextLong()),
           new RandomValues.LongGenerator(random.nextLong()),
           new RandomValues.LongGenerator(random.nextLong()),
-          new RandomValues.FixedGenerator(random.nextLong(), 12));
+          new RandomValues.FixedGenerator(random.nextLong(), 12),
+          new RandomValues.FixedGenerator(random.nextLong(), 2));
     }

     private static MessageType buildSchema(long seed) {
@@ -388,7 +391,11 @@ private static MessageType buildSchema(long seed) {
           Types.optional(FIXED_LEN_BYTE_ARRAY)
               .length(12)
               .as(OriginalType.INTERVAL)
-              .named("interval"));
+              .named("interval"),
+          Types.optional(FIXED_LEN_BYTE_ARRAY)
+              .length(2)
+              .named("float16")
+              .withLogicalTypeAnnotation(LogicalTypeAnnotation.float16Type()));
     }

     private static int calculatePrecision(int byteCnt) {
@@ -410,6 +417,16 @@ public void write(ParquetWriter writer) throws IOException {
           switch (type.asPrimitiveType().getPrimitiveTypeName()) {
             case BINARY:
             case FIXED_LEN_BYTE_ARRAY:
+              if (type.getLogicalTypeAnnotation()
+                  instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation) {
+                Binary b = ((RandomBinaryBase) generator).nextBinaryValue();
+                // NaN has no defined sort order, so replace it with the most negative finite half-precision value (-65504.0)
+                Binary v = Float16.isNaN(b.get2BytesLittleEndian())
+                    ? Binary.fromConstantByteArray(new byte[] {(byte) 0xff, (byte) 0xfb})
+                    : b;
+                group.append(type.getName(), v);
+                break;
+              }
             case INT96:
               group.append(type.getName(), ((RandomBinaryBase) generator).nextBinaryValue());
               break;
diff --git a/pom.xml b/pom.xml
index 13d188d8be..88a02d621f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -510,6 +510,7 @@
 thrift-${thrift.version}/**
 thrift-${thrift.version}.tar.gz
 **/dependency-reduced-pom.xml
+ **/*.rej
@@ -596,6 +597,9 @@
 org.apache.parquet.arrow.schema.SchemaMapping
+
+ org.apache.parquet.io.api.Binary#get2BytesLittleEndian()
+ org.apache.parquet.schema.LogicalTypeAnnotation$Float16LogicalTypeAnnotation#accept(org.apache.parquet.schema.LogicalTypeAnnotation$LogicalTypeAnnotationVisitor)