PARQUET-1647: Implement logical type FLOAT16

apache · Oct 27, 2023 · 447dd5b · 447dd5b
1 parent 69eaefa
commit 447dd5b
Show file tree

Hide file tree

Showing 20 changed files with 1,176 additions and 9 deletions.
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -21,12 +21,13 @@
 import java.util.Arrays;
 import org.apache.parquet.column.UnknownColumnTypeException;
 import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
 import org.apache.parquet.schema.PrimitiveComparator;
 import org.apache.parquet.schema.PrimitiveStringifier;
 import org.apache.parquet.schema.PrimitiveType;
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
 import org.apache.parquet.schema.Type;
-
+import org.apache.parquet.schema.Float16;
 
 /**
  * Statistics class to keep track of statistics in parquet pages and column chunks
@@ -139,6 +140,43 @@ public Statistics<?> build() {
     }
   }
 
+  // Builder for FLOAT16 type to handle special cases of min/max values like NaN, -0.0, and 0.0
+  private static class Float16Builder extends Builder {
+    private static final Binary POSITIVE_ZERO_LITTLE_ENDIAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x00});
+    private static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN = Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80});
+    // Raw half-precision bit patterns of +0.0 and -0.0
+    private static final short POSITIVE_ZERO_BITS = (short) 0x0000;
+    private static final short NEGATIVE_ZERO_BITS = (short) 0x8000;
+
+    /**
+     * @param type the column type; asserted to be a 2-byte FIXED_LEN_BYTE_ARRAY
+     */
+    public Float16Builder(PrimitiveType type) {
+      super(type);
+      assert type.getPrimitiveTypeName() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
+      assert type.getTypeLength() == 2;
+    }
+
+    /**
+     * Builds the statistics and then post-processes the min/max values:
+     * if either bound is NaN the min/max are dropped, otherwise a +0.0 min is widened
+     * to -0.0 and a -0.0 max is widened to +0.0.
+     *
+     * @return the built statistics with adjusted min/max values
+     */
+    @Override
+    public Statistics<?> build() {
+      BinaryStatistics stats = (BinaryStatistics) super.build();
+      if (!stats.hasNonNullValue()) {
+        return stats;
+      }
+
+      Binary bMin = stats.genericGetMin();
+      Binary bMax = stats.genericGetMax();
+      short min = bMin.get2BytesLittleEndian();
+      short max = bMax.get2BytesLittleEndian();
+
+      // Drop min/max values in case of NaN as the sorting order of values is undefined for this case
+      if (Float16.isNaN(min) || Float16.isNaN(max)) {
+        stats.setMinMax(POSITIVE_ZERO_LITTLE_ENDIAN, NEGATIVE_ZERO_LITTLE_ENDIAN);
+        ((Statistics<?>) stats).hasNonNullValue = false;
+        return stats;
+      }
+
+      // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped.
+      // Both replacements are derived from the original bounds and applied in a single call so
+      // that, when min is +0.0 and max is -0.0 at the same time, fixing one bound does not
+      // overwrite the fix of the other with its stale value.
+      boolean minIsPositiveZero = min == POSITIVE_ZERO_BITS;
+      boolean maxIsNegativeZero = max == NEGATIVE_ZERO_BITS;
+      if (minIsPositiveZero || maxIsNegativeZero) {
+        stats.setMinMax(
+            minIsPositiveZero ? NEGATIVE_ZERO_LITTLE_ENDIAN : bMin,
+            maxIsNegativeZero ? POSITIVE_ZERO_LITTLE_ENDIAN : bMax);
+      }
+      return stats;
+    }
+  }
+
   private final PrimitiveType type;
   private final PrimitiveComparator<T> comparator;
   private boolean hasNonNullValue;
@@ -226,6 +264,11 @@ public static Builder getBuilderForReading(PrimitiveType type) {
         return new FloatBuilder(type);
       case DOUBLE:
         return new DoubleBuilder(type);
+      case FIXED_LEN_BYTE_ARRAY:
+        LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation();
+        if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation) {
+          return new Float16Builder(type);
+        }
       default:
         return new Builder(type);
     }

diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
@@ -24,6 +24,7 @@
 import java.io.OutputStream;
 import java.io.Serializable;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.CharBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.CharsetEncoder;
@@ -85,6 +86,8 @@ private Binary() { }
 
   abstract public ByteBuffer toByteBuffer();
 
+  abstract public short get2BytesLittleEndian(); // decodes a 2-byte value as a little-endian short
+
   @Override
   public boolean equals(Object obj) {
     if (obj == null) {
@@ -218,6 +221,15 @@ public ByteBuffer toByteBuffer() {
       return ByteBuffer.wrap(value, offset, length);
     }
 
+    /** Decodes this two-byte value as a little-endian short; any other length is rejected. */
+    @Override
+    public short get2BytesLittleEndian() {
+      if (length == 2) {
+        // The second byte carries the high-order bits; masking undoes byte sign extension.
+        final int highByte = value[offset + 1] & 0xff;
+        final int lowByte = value[offset] & 0xff;
+        return (short) ((highByte << 8) | lowByte);
+      }
+      throw new IllegalArgumentException("length must be 2");
+    }
+
     @Override
     public void writeTo(DataOutput out) throws IOException {
       out.write(value, offset, length);
@@ -371,6 +383,15 @@ public ByteBuffer toByteBuffer() {
       return ByteBuffer.wrap(value);
     }
 
+    /** Reads the backing two-byte array as a little-endian short; other lengths are rejected. */
+    @Override
+    public short get2BytesLittleEndian() {
+      final boolean hasTwoBytes = value.length == 2;
+      if (!hasTwoBytes) {
+        throw new IllegalArgumentException("length must be 2");
+      }
+
+      // Index 1 is the most significant byte, index 0 the least significant one.
+      final int upper = (value[1] & 0xff) << 8;
+      final int lower = value[0] & 0xff;
+      return (short) (upper | lower);
+    }
+
     @Override
     public void writeTo(DataOutput out) throws IOException {
       out.write(value);
@@ -547,6 +568,15 @@ public ByteBuffer toByteBuffer() {
       return ret;
     }
 
+    /**
+     * Decodes the two bytes starting at {@code offset} of the backing buffer as a
+     * little-endian short.
+     *
+     * @return the decoded short value
+     * @throws IllegalArgumentException if this value is not exactly 2 bytes long
+     */
+    @Override
+    public short get2BytesLittleEndian() {
+      if (length != 2) {
+        throw new IllegalArgumentException("length must be 2");
+      }
+
+      // Absolute single-byte reads are independent of the buffer's byte order, so the backing
+      // buffer is left untouched; calling order(...) on it would change its byte order for
+      // every later multi-byte read through the same buffer instance.
+      return (short) (((value.get(offset + 1) & 0xff) << 8) | (value.get(offset) & 0xff));
+    }
+
     @Override
     public void writeTo(DataOutput out) throws IOException {
       // TODO: should not have to materialize those bytes

diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java b/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
@@ -0,0 +1,259 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.schema;
+
+import org.apache.parquet.io.api.Binary;
+
+/**
+ * The class is a utility class to manipulate half-precision 16-bit
+ * <a href="https://en.wikipedia.org/wiki/Half-precision_floating-point_format">IEEE 754</a>
+ * floating point data types (also called fp16 or binary16). A half-precision float can be
+ * created from or converted to single-precision floats, and is stored in a short data type.
+ * The IEEE 754 standard specifies a float16 as having the following format:
+ * <ul>
+ * <li>Sign bit: 1 bit</li>
+ * <li>Exponent width: 5 bits</li>
+ * <li>Significand: 10 bits</li>
+ * </ul>
+ *
+ * <p>The format is laid out as follows:</p>
+ * <pre>
+ * 1   11111   1111111111
+ * ^   --^--   -----^----
+ * sign  |          |_______ significand
+ *       |
+ *      -- exponent
+ * </pre>
+ * Half-precision floating points can be useful to save memory and/or
+ * bandwidth at the expense of range and precision when compared to single-precision
+ * floating points (float32).
+ * Ref: https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java
+ */
+public class Float16 {
+  // Positive infinity of type half-precision float.
+  private static final short POSITIVE_INFINITY = (short) 0x7c00;
+  // A Not-a-Number representation of a half-precision float.
+  private static final short NaN = (short) 0x7e00;
+  // The bitmask to AND with a number to obtain the sign bit.
+  private static final int SIGN_MASK                 = 0x8000;
+  // The offset to shift by to obtain the exponent bits.
+  private static final int EXPONENT_SHIFT            = 10;
+  // The bitmask to AND with a number shifted right by EXPONENT_SHIFT to obtain the exponent bits.
+  private static final int SHIFTED_EXPONENT_MASK     = 0x1f;
+  // The bitmask to AND with a number to obtain the significand bits.
+  private static final int SIGNIFICAND_MASK          = 0x3ff;
+  // The bias of the stored exponent bits relative to the actual exponent value.
+  private static final int EXPONENT_BIAS             = 15;
+  // The offset to shift by to obtain the sign bit.
+  private static final int SIGN_SHIFT                = 15;
+  // The bitmask to AND with to obtain exponent and significand bits.
+  private static final int EXPONENT_SIGNIFICAND_MASK = 0x7fff;
+
+  private static final int FP32_SIGN_SHIFT            = 31;
+  private static final int FP32_EXPONENT_SHIFT        = 23;
+  private static final int FP32_SHIFTED_EXPONENT_MASK = 0xff;
+  private static final int FP32_SIGNIFICAND_MASK      = 0x7fffff;
+  private static final int FP32_EXPONENT_BIAS         = 127;
+  private static final int FP32_QNAN_MASK             = 0x400000;
+  private static final int FP32_DENORMAL_MAGIC = 126 << 23;
+  private static final float FP32_DENORMAL_FLOAT = Float.intBitsToFloat(FP32_DENORMAL_MAGIC);
+
+  /**
+   * Returns true if the specified half-precision float value represents
+   * a Not-a-Number, false otherwise. The sign bit is ignored: a value is a NaN when its
+   * exponent and significand bits, taken together, are greater than those of positive infinity.
+   *
+   * @param h A half-precision float value
+   * @return True if the value is a NaN, false otherwise
+   *
+   */
+  public static boolean isNaN(short h) {
+    return (h & EXPONENT_SIGNIFICAND_MASK) > POSITIVE_INFINITY;
+  }
+
+  /**
+   * <p>Compares the two specified half-precision float values. The following
+   * conditions apply during the comparison:</p>
+   *
+   * <ul>
+   * <li>NaN is considered by this method to be equal to itself and greater
+   * than all other half-precision float values (including {@code #POSITIVE_INFINITY})</li>
+   * <li>POSITIVE_ZERO is considered by this method to be greater than NEGATIVE_ZERO.</li>
+   * </ul>
+   *
+   * <p>Consequently {@code 0} is returned only when both values are NaN or both have
+   * identical bits; +0.0 and -0.0 do not compare as equal.</p>
+   *
+   * @param x The first half-precision float value to compare.
+   * @param y The second half-precision float value to compare
+   *
+   * @return  The value {@code 0} if {@code x} is equal to {@code y} under the rules above,
+   *          {@code -1} if {@code x} is less than {@code y},
+   *          and {@code 1} if {@code x} is greater than {@code y}
+   *
+   */
+  public static int compare(short x, short y) {
+    boolean xIsNaN = isNaN(x);
+    boolean yIsNaN = isNaN(y);
+
+    if (!xIsNaN && !yIsNaN) {
+      int first = ((x & SIGN_MASK) != 0 ? 0x8000 - (x & 0xffff) : x & 0xffff);
+      int second = ((y & SIGN_MASK) != 0 ? 0x8000 - (y & 0xffff) : y & 0xffff);
+      // The first half-precision float value is less
+      // (smaller toward negative infinity) than the second half-precision float value.
+      if (first < second) {
+        return -1;
+      }
+
+      // The first half-precision float value is greater
+      // (larger toward positive infinity) than the second half-precision float value.
+      if (first > second) {
+        return 1;
+      }
+    }
+
+    // Collapse NaNs, akin to halfToIntBits(), but we want to keep
+    // (signed) short value types to preserve the ordering of -0.0
+    // and +0.0
+    short xBits = xIsNaN ? NaN : x;
+    short yBits = yIsNaN ? NaN : y;
+    return (xBits == yBits ? 0 : (xBits < yBits ? -1 : 1));
+  }
+
+  /**
+   * Converts the specified half-precision float value in Binary little endian into a
+   * single-precision float value. The following special cases are handled:
+   * If the input is NaN, the returned value is Float NaN.
+   * If the input is POSITIVE_INFINITY or NEGATIVE_INFINITY, the returned value is respectively
+   *   Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY.
+   * If the input is 0 (positive or negative), the returned value is +/-0.0f.
+   * If the input is a denormal half-precision value, it is converted to a normalized
+   *   single-precision value of the same sign.
+   * Otherwise, the returned value is a normalized single-precision float value.
+   *
+   * @param b The half-precision float value in Binary little endian to convert to single-precision;
+   *          it is read through {@code Binary.get2BytesLittleEndian()}
+   * @return A normalized single-precision float value
+   */
+   static float toFloat(Binary b) {
+    short h = b.get2BytesLittleEndian();
+    int bits = h & 0xffff;
+    int s = bits & SIGN_MASK;
+    int e = (bits >>> EXPONENT_SHIFT) & SHIFTED_EXPONENT_MASK;
+    int m = (bits                        ) & SIGNIFICAND_MASK;
+    int outE = 0;
+    int outM = 0;
+    if (e == 0) { // Denormal or 0
+      if (m != 0) {
+        // Convert denorm fp16 into normalized fp32
+        float o = Float.intBitsToFloat(FP32_DENORMAL_MAGIC + m);
+        o -= FP32_DENORMAL_FLOAT;
+        return s == 0 ? o : -o;
+      }
+    } else {
+      outM = m << 13;
+      if (e == 0x1f) { // Infinite or NaN
+        outE = 0xff;
+        if (outM != 0) { // SNaNs are quieted
+          outM |= FP32_QNAN_MASK;
+        }
+      } else {
+        outE = e - EXPONENT_BIAS + FP32_EXPONENT_BIAS;
+      }
+    }
+    int out = (s << 16) | (outE << FP32_EXPONENT_SHIFT) | outM;
+    return Float.intBitsToFloat(out);
+  }
+
+  /**
+   * Converts the specified single-precision float value into a
+   * half-precision float value. The following special cases are handled:
+   *
+   * If the input is NaN, the returned value is NaN.
+   * If the input is Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY,
+   *   the returned value is respectively POSITIVE_INFINITY or NEGATIVE_INFINITY.
+   * If the input is 0 (positive or negative), the returned value is
+   *   POSITIVE_ZERO or NEGATIVE_ZERO.
+   * If the magnitude of the input is too large for the half-precision exponent range,
+   *   the returned value is POSITIVE_INFINITY or NEGATIVE_INFINITY.
+   * If the magnitude of the input is less than MIN_VALUE, the returned value
+   *   is flushed to POSITIVE_ZERO or NEGATIVE_ZERO.
+   * If the magnitude of the input is less than MIN_NORMAL, the returned value
+   *   is a denorm half-precision float.
+   * Otherwise, the returned value is rounded to the nearest
+   *   representable half-precision float value (ties round to even).
+   *
+   * @param f The single-precision float value to convert to half-precision
+   * @return A half-precision float value
+   */
+  static short toFloat16(float f) {
+    int bits = Float.floatToRawIntBits(f);
+    int s = (bits >>> FP32_SIGN_SHIFT    );
+    int e = (bits >>> FP32_EXPONENT_SHIFT) & FP32_SHIFTED_EXPONENT_MASK;
+    int m = (bits                        ) & FP32_SIGNIFICAND_MASK;
+    int outE = 0;
+    int outM = 0;
+    if (e == 0xff) { // Infinite or NaN
+      outE = 0x1f;
+      outM = m != 0 ? 0x200 : 0;
+    } else {
+      e = e - FP32_EXPONENT_BIAS + EXPONENT_BIAS;
+      if (e >= 0x1f) { // Overflow
+        outE = 0x1f;
+      } else if (e <= 0) { // Underflow
+        if (e < -10) {
+          // The absolute fp32 value is less than MIN_VALUE, flush to +/-0
+        } else {
+          // The fp32 value is a normalized float less than MIN_NORMAL,
+          // we convert to a denorm fp16
+          m = m | 0x800000;
+          int shift = 14 - e;
+          outM = m >> shift;
+          int lowm = m & ((1 << shift) - 1);
+          int hway = 1 << (shift - 1);
+          // if above halfway or exactly halfway and outM is odd
+          if (lowm + (outM & 1) > hway){
+            // Round to nearest even
+            // Can overflow into exponent bit, which surprisingly is OK.
+            // This increment relies on the +outM in the return statement below
+            outM++;
+          }
+        }
+      } else {
+        outE = e;
+        outM = m >> 13;
+        // if above halfway or exactly halfway and outM is odd
+        if ((m & 0x1fff) + (outM & 0x1) > 0x1000) {
+          // Round to nearest even
+          // Can overflow into exponent bit, which surprisingly is OK.
+          // This increment relies on the +outM in the return statement below
+          outM++;
+        }
+      }
+    }
+    // The outM is added (not OR-ed) here as the +1 increments for outM above can
+    // carry into the exponent bits, which is OK.
+    return (short) ((s << SIGN_SHIFT) | (outE << EXPONENT_SHIFT) + outM);
+  }
+
+  /**
+   * Returns a string representation of the specified half-precision
+   * float value. Calling this method is equivalent to calling
+   * <code>Float.toString(toFloat(h))</code>. See {@link Float#toString(float)}
+   * for more information on the format of the string representation.
+   *
+   * @param h A half-precision float value in binary little-endian format
+   * @return A string representation of the specified value
+   */
+  static String toFloatString(Binary h) {
+    return Float.toString(Float16.toFloat(h));
+  }
+}