From 160d45c251b5041b9688ff68c3a7c5e091a50989 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Mon, 13 Nov 2023 22:21:12 -0900 Subject: [PATCH 01/23] MINOR: [Docs] Tweak text on docs index page (#38695) ### Rationale for this change These are just some minor style tweaks which made the text and buttons on these cards read more naturally to me. Feel free to ignore or take only some of the changes here. ### Are these changes tested? Yes, I confirmed the docs still build as expected locally. ### Are there any user-facing changes? New language in docs, see above. Authored-by: Bryce Mecum Signed-off-by: AlenkaF --- docs/source/index.rst | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index d01c74f9a482e..8407813bd7abb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -46,8 +46,8 @@ target environment.** :class-card: contrib-card :shadow: none - Read about the Apache Arrow format - specifications and Protocols. + Read about the Apache Arrow format and its related specifications and + protocols. +++ @@ -57,17 +57,15 @@ target environment.** :color: primary :expand: - To the Specifications + To Specifications .. grid-item-card:: Development :class-card: contrib-card :shadow: none - Find the documentation on the topic of - contributions, reviews, building of the libraries - from source, building of the documentation, - continuous integration, benchmarks and the - release process. + Find documentation on building the libraries from source, building the + documentation, contributing and code reviews, continuous integration, + benchmarking, and the release process. +++ @@ -77,7 +75,7 @@ target environment.** :color: primary :expand: - To the Development + To Development .. 
_toc.columnar: From a4080209a97a5d66accdeb71c5c1ffa982fed51e Mon Sep 17 00:00:00 2001 From: James Duong Date: Tue, 14 Nov 2023 05:10:36 -0800 Subject: [PATCH 02/23] GH-38662: [Java] Add comparators (#38669) ### Rationale for this change Add missing Default VectorValueComparators for some more types. ### What changes are included in this PR? Add comparators for: - FixedSizeBinaryVector - LargeListVector - FixedSizeListVector - NullVector ### Are these changes tested? Yes, unit tests added. ### Are there any user-facing changes? No * Closes: #38662 Authored-by: James Duong Signed-off-by: David Li --- .../sort/DefaultVectorComparators.java | 140 ++++++++++++++++-- .../sort/TestDefaultVectorComparator.java | 132 +++++++++++++++++ 2 files changed, 259 insertions(+), 13 deletions(-) diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/DefaultVectorComparators.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/DefaultVectorComparators.java index 4f9c8b7d71bab..588876aa99059 100644 --- a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/DefaultVectorComparators.java +++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/DefaultVectorComparators.java @@ -32,11 +32,13 @@ import org.apache.arrow.vector.Decimal256Vector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.IntervalDayVector; import org.apache.arrow.vector.IntervalMonthDayNanoVector; +import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeMilliVector; @@ -50,7 +52,9 @@ import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.ValueVector; import 
org.apache.arrow.vector.VariableWidthVector; -import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.RepeatedValueVector; +import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder; /** * Default comparator implementations for different types of vectors. @@ -111,13 +115,21 @@ public static VectorValueComparator createDefaultComp return (VectorValueComparator) new TimeSecComparator(); } else if (vector instanceof TimeStampVector) { return (VectorValueComparator) new TimeStampComparator(); + } else if (vector instanceof FixedSizeBinaryVector) { + return (VectorValueComparator) new FixedSizeBinaryComparator(); } } else if (vector instanceof VariableWidthVector) { return (VectorValueComparator) new VariableWidthComparator(); - } else if (vector instanceof BaseRepeatedValueVector) { + } else if (vector instanceof RepeatedValueVector) { VectorValueComparator innerComparator = - createDefaultComparator(((BaseRepeatedValueVector) vector).getDataVector()); + createDefaultComparator(((RepeatedValueVector) vector).getDataVector()); return new RepeatedValueComparator(innerComparator); + } else if (vector instanceof FixedSizeListVector) { + VectorValueComparator innerComparator = + createDefaultComparator(((FixedSizeListVector) vector).getDataVector()); + return new FixedSizeListComparator(innerComparator); + } else if (vector instanceof NullVector) { + return (VectorValueComparator) new NullComparator(); } throw new IllegalArgumentException("No default comparator for " + vector.getClass().getCanonicalName()); @@ -674,6 +686,61 @@ public VectorValueComparator createNew() { } } + /** + * Default comparator for {@link org.apache.arrow.vector.FixedSizeBinaryVector}. + * The comparison is in lexicographic order, with null comes first. 
+ */ + public static class FixedSizeBinaryComparator extends VectorValueComparator { + + @Override + public int compare(int index1, int index2) { + NullableFixedSizeBinaryHolder holder1 = new NullableFixedSizeBinaryHolder(); + NullableFixedSizeBinaryHolder holder2 = new NullableFixedSizeBinaryHolder(); + vector1.get(index1, holder1); + vector2.get(index2, holder2); + + return ByteFunctionHelpers.compare( + holder1.buffer, 0, holder1.byteWidth, holder2.buffer, 0, holder2.byteWidth); + } + + @Override + public int compareNotNull(int index1, int index2) { + NullableFixedSizeBinaryHolder holder1 = new NullableFixedSizeBinaryHolder(); + NullableFixedSizeBinaryHolder holder2 = new NullableFixedSizeBinaryHolder(); + vector1.get(index1, holder1); + vector2.get(index2, holder2); + + return ByteFunctionHelpers.compare( + holder1.buffer, 0, holder1.byteWidth, holder2.buffer, 0, holder2.byteWidth); + } + + @Override + public VectorValueComparator createNew() { + return new FixedSizeBinaryComparator(); + } + } + + /** + * Default comparator for {@link org.apache.arrow.vector.NullVector}. + */ + public static class NullComparator extends VectorValueComparator { + @Override + public int compare(int index1, int index2) { + // Values are always equal (and are always null). + return 0; + } + + @Override + public int compareNotNull(int index1, int index2) { + throw new AssertionError("Cannot compare non-null values in a NullVector."); + } + + @Override + public VectorValueComparator createNew() { + return new NullComparator(); + } + } + /** * Default comparator for {@link org.apache.arrow.vector.VariableWidthVector}. * The comparison is in lexicographic order, with null comes first. @@ -705,14 +772,14 @@ public VectorValueComparator createNew() { } /** - * Default comparator for {@link BaseRepeatedValueVector}. + * Default comparator for {@link RepeatedValueVector}. * It works by comparing the underlying vector in a lexicographic order. * @param inner vector type. 
*/ public static class RepeatedValueComparator - extends VectorValueComparator { + extends VectorValueComparator { - private VectorValueComparator innerComparator; + private final VectorValueComparator innerComparator; public RepeatedValueComparator(VectorValueComparator innerComparator) { this.innerComparator = innerComparator; @@ -720,16 +787,16 @@ public RepeatedValueComparator(VectorValueComparator innerComparator) { @Override public int compareNotNull(int index1, int index2) { - int startIdx1 = vector1.getOffsetBuffer().getInt(index1 * OFFSET_WIDTH); - int startIdx2 = vector2.getOffsetBuffer().getInt(index2 * OFFSET_WIDTH); + int startIdx1 = vector1.getOffsetBuffer().getInt((long) index1 * OFFSET_WIDTH); + int startIdx2 = vector2.getOffsetBuffer().getInt((long) index2 * OFFSET_WIDTH); - int endIdx1 = vector1.getOffsetBuffer().getInt((index1 + 1) * OFFSET_WIDTH); - int endIdx2 = vector2.getOffsetBuffer().getInt((index2 + 1) * OFFSET_WIDTH); + int endIdx1 = vector1.getOffsetBuffer().getInt((long) (index1 + 1) * OFFSET_WIDTH); + int endIdx2 = vector2.getOffsetBuffer().getInt((long) (index2 + 1) * OFFSET_WIDTH); int length1 = endIdx1 - startIdx1; int length2 = endIdx2 - startIdx2; - int length = length1 < length2 ? 
length1 : length2; + int length = Math.min(length1, length2); for (int i = 0; i < length; i++) { int result = innerComparator.compare(startIdx1 + i, startIdx2 + i); @@ -741,13 +808,60 @@ public int compareNotNull(int index1, int index2) { } @Override - public VectorValueComparator createNew() { + public VectorValueComparator createNew() { VectorValueComparator newInnerComparator = innerComparator.createNew(); return new RepeatedValueComparator<>(newInnerComparator); } @Override - public void attachVectors(BaseRepeatedValueVector vector1, BaseRepeatedValueVector vector2) { + public void attachVectors(RepeatedValueVector vector1, RepeatedValueVector vector2) { + this.vector1 = vector1; + this.vector2 = vector2; + + innerComparator.attachVectors((T) vector1.getDataVector(), (T) vector2.getDataVector()); + } + } + + /** + * Default comparator for {@link FixedSizeListVector}. + * It works by comparing the underlying vector in a lexicographic order. + * @param inner vector type. + */ + public static class FixedSizeListComparator + extends VectorValueComparator { + + private final VectorValueComparator innerComparator; + + public FixedSizeListComparator(VectorValueComparator innerComparator) { + this.innerComparator = innerComparator; + } + + @Override + public int compareNotNull(int index1, int index2) { + int length1 = vector1.getListSize(); + int length2 = vector2.getListSize(); + + int length = Math.min(length1, length2); + int startIdx1 = vector1.getElementStartIndex(index1); + int startIdx2 = vector2.getElementStartIndex(index2); + + for (int i = 0; i < length; i++) { + int result = innerComparator.compare(startIdx1 + i, startIdx2 + i); + if (result != 0) { + return result; + } + } + return length1 - length2; + } + + @Override + public VectorValueComparator createNew() { + VectorValueComparator newInnerComparator = innerComparator.createNew(); + return new FixedSizeListComparator<>(newInnerComparator); + } + + @Override + public void
attachVectors(FixedSizeListVector vector1, FixedSizeListVector vector2) { this.vector1 = vector1; this.vector2 = vector2; diff --git a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestDefaultVectorComparator.java b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestDefaultVectorComparator.java index bdae85110aa62..43c634b7647fb 100644 --- a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestDefaultVectorComparator.java +++ b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestDefaultVectorComparator.java @@ -31,12 +31,14 @@ import org.apache.arrow.vector.Decimal256Vector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.IntervalDayVector; import org.apache.arrow.vector.LargeVarBinaryVector; import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeMilliVector; @@ -52,6 +54,8 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.testing.ValueVectorDataPopulator; import org.apache.arrow.vector.types.TimeUnit; @@ -158,6 +162,61 @@ public void testCopiedComparatorForLists() { } } + private FixedSizeListVector createFixedSizeListVector(int count) { + FixedSizeListVector listVector = FixedSizeListVector.empty("list vector", count, allocator); + Types.MinorType type = Types.MinorType.INT; + 
listVector.addOrGetVector(FieldType.nullable(type.getType())); + listVector.allocateNew(); + + IntVector dataVector = (IntVector) listVector.getDataVector(); + + for (int i = 0; i < count; i++) { + dataVector.set(i, i); + } + dataVector.setValueCount(count); + + listVector.setNotNull(0); + listVector.setValueCount(1); + + return listVector; + } + + @Test + public void testCompareFixedSizeLists() { + try (FixedSizeListVector listVector1 = createFixedSizeListVector(10); + FixedSizeListVector listVector2 = createFixedSizeListVector(11)) { + VectorValueComparator comparator = + DefaultVectorComparators.createDefaultComparator(listVector1); + comparator.attachVectors(listVector1, listVector2); + + // prefix is smaller + assertTrue(comparator.compare(0, 0) < 0); + } + + try (FixedSizeListVector listVector1 = createFixedSizeListVector(11); + FixedSizeListVector listVector2 = createFixedSizeListVector(11)) { + ((IntVector) listVector2.getDataVector()).set(10, 110); + + VectorValueComparator comparator = + DefaultVectorComparators.createDefaultComparator(listVector1); + comparator.attachVectors(listVector1, listVector2); + + // breaking tie by the last element + assertTrue(comparator.compare(0, 0) < 0); + } + + try (FixedSizeListVector listVector1 = createFixedSizeListVector(10); + FixedSizeListVector listVector2 = createFixedSizeListVector(10)) { + + VectorValueComparator comparator = + DefaultVectorComparators.createDefaultComparator(listVector1); + comparator.attachVectors(listVector1, listVector2); + + // list vector elements equal + assertTrue(comparator.compare(0, 0) == 0); + } + } + @Test public void testCompareUInt1() { try (UInt1Vector vec = new UInt1Vector("", allocator)) { @@ -845,6 +904,65 @@ public void testCompareTimeStamp() { } } + @Test + public void testCompareFixedSizeBinary() { + try (FixedSizeBinaryVector vector1 = new FixedSizeBinaryVector("test1", allocator, 2); + FixedSizeBinaryVector vector2 = new FixedSizeBinaryVector("test1", allocator, 3)) { + 
vector1.allocateNew(); + vector2.allocateNew(); + vector1.set(0, new byte[] {1, 1}); + vector2.set(0, new byte[] {1, 1, 0}); + VectorValueComparator comparator = + DefaultVectorComparators.createDefaultComparator(vector1); + comparator.attachVectors(vector1, vector2); + + // prefix is smaller + assertTrue(comparator.compare(0, 0) < 0); + } + + try (FixedSizeBinaryVector vector1 = new FixedSizeBinaryVector("test1", allocator, 3); + FixedSizeBinaryVector vector2 = new FixedSizeBinaryVector("test1", allocator, 3)) { + vector1.allocateNew(); + vector2.allocateNew(); + vector1.set(0, new byte[] {1, 1, 0}); + vector2.set(0, new byte[] {1, 1, 1}); + VectorValueComparator comparator = + DefaultVectorComparators.createDefaultComparator(vector1); + comparator.attachVectors(vector1, vector2); + + // breaking tie by the last element + assertTrue(comparator.compare(0, 0) < 0); + } + + try (FixedSizeBinaryVector vector1 = new FixedSizeBinaryVector("test1", allocator, 3); + FixedSizeBinaryVector vector2 = new FixedSizeBinaryVector("test1", allocator, 3)) { + vector1.allocateNew(); + vector2.allocateNew(); + vector1.set(0, new byte[] {1, 1, 1}); + vector2.set(0, new byte[] {1, 1, 1}); + VectorValueComparator comparator = + DefaultVectorComparators.createDefaultComparator(vector1); + comparator.attachVectors(vector1, vector2); + + // list vector elements equal + assertTrue(comparator.compare(0, 0) == 0); + } + } + + @Test + public void testCompareNull() { + try (NullVector vec = new NullVector("test", + FieldType.notNullable(new ArrowType.Int(32, false)))) { + vec.setValueCount(2); + + VectorValueComparator comparator = + DefaultVectorComparators.createDefaultComparator(vec); + comparator.attachVector(vec); + assertEquals(DefaultVectorComparators.NullComparator.class, comparator.getClass()); + assertEquals(0, comparator.compare(0, 1)); + } + } + @Test public void testCheckNullsOnCompareIsFalseForNonNullableVector() { try (IntVector vec = new IntVector("not nullable", @@ -937,4 
+1055,18 @@ private static void verifyVariableWidthComparatorReturne VectorValueComparator comparator = DefaultVectorComparators.createDefaultComparator(vec); assertEquals(DefaultVectorComparators.VariableWidthComparator.class, comparator.getClass()); } + + @Test + public void testRepeatedDefaultComparators() { + final FieldType type = FieldType.nullable(Types.MinorType.INT.getType()); + try (final LargeListVector vector = new LargeListVector("list", allocator, type, null)) { + vector.addOrGetVector(FieldType.nullable(type.getType())); + verifyRepeatedComparatorReturned(vector); + } + } + + private static void verifyRepeatedComparatorReturned(V vec) { + VectorValueComparator comparator = DefaultVectorComparators.createDefaultComparator(vec); + assertEquals(DefaultVectorComparators.RepeatedValueComparator.class, comparator.getClass()); + } } From f3ec224ab6ace14f630509c79dfbba2ec32d881a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 14 Nov 2023 14:25:29 +0100 Subject: [PATCH 03/23] GH-38626: [Python] Fix segfault when PyArrow is imported at shutdown (#38637) ### Rationale for this change Some C++ destructors may be called after the Python interpreter has ceased to exist. If such a destructor tries to call back into the Python interpreter, for example by calling `Py_DECREF`, we get a crash. ### What changes are included in this PR? Protect `OwnedRef` and `OwnedRefNoGIL` destructors against decref'ing a Python object after Python finalization. ### Are these changes tested? Yes. ### Are there any user-facing changes? No.
* Closes: #38626 Authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/common.h | 17 ++++++++++------- python/pyarrow/tests/test_misc.py | 13 +++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/src/arrow/python/common.h b/python/pyarrow/src/arrow/python/common.h index e36c0834fd424..bc567ef78e83a 100644 --- a/python/pyarrow/src/arrow/python/common.h +++ b/python/pyarrow/src/arrow/python/common.h @@ -188,7 +188,12 @@ class ARROW_PYTHON_EXPORT OwnedRef { return *this; } - ~OwnedRef() { reset(); } + ~OwnedRef() { + // GH-38626: destructor may be called after the Python interpreter is finalized. + if (Py_IsInitialized()) { + reset(); + } + } void reset(PyObject* obj) { Py_XDECREF(obj_); @@ -225,13 +230,11 @@ class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef { explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {} ~OwnedRefNoGIL() { - // This destructor may be called after the Python interpreter is finalized. - // At least avoid spurious attempts to take the GIL when not necessary. - if (obj() == NULLPTR) { - return; + // GH-38626: destructor may be called after the Python interpreter is finalized. 
+ if (Py_IsInitialized() && obj() != NULLPTR) { + PyAcquireGIL lock; + reset(); } - PyAcquireGIL lock; - reset(); } }; diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 9b9dfdd554806..a48ac0c3cd81a 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -117,6 +117,19 @@ def test_runtime_info(): subprocess.check_call([sys.executable, "-c", code], env=env) +def test_import_at_shutdown(): + # GH-38626: importing PyArrow at interpreter shutdown would crash + code = """if 1: + import atexit + + def import_arrow(): + import pyarrow + + atexit.register(import_arrow) + """ + subprocess.check_call([sys.executable, "-c", code]) + + @pytest.mark.skipif(sys.platform == "win32", reason="Path to timezone database is not configurable " "on non-Windows platforms") From bb7ffaf0bd0661baba872c3fe1500369f26241bd Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 14 Nov 2023 10:43:28 -0500 Subject: [PATCH 04/23] GH-38477: [Go] Fixing decimal 128 rounding issue (#38478) ### Rationale for this change Fixing an off-by-one rounding issue with decimal128 by ensuring proper precision handling. ### Are these changes tested? The test case which reproduced the rounding issue has been added as a unit test. 
* Closes: #38477 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/arrow/decimal128/decimal128.go | 4 ++-- go/arrow/decimal128/decimal128_test.go | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/go/arrow/decimal128/decimal128.go b/go/arrow/decimal128/decimal128.go index 7bde39d327417..3b88dce1fa809 100644 --- a/go/arrow/decimal128/decimal128.go +++ b/go/arrow/decimal128/decimal128.go @@ -261,7 +261,7 @@ func FromString(v string, prec, scale int32) (n Num, err error) { var precInBits = uint(math.Round(float64(prec+scale+1)/math.Log10(2))) + 1 var out *big.Float - out, _, err = big.ParseFloat(v, 10, 127, big.ToNearestEven) + out, _, err = big.ParseFloat(v, 10, 128, big.ToNearestEven) if err != nil { return } @@ -280,7 +280,7 @@ func FromString(v string, prec, scale int32) (n Num, err error) { // (e.g. C++) handles Decimal values. So if we're negative we'll subtract 0.5 and if // we're positive we'll add 0.5. p := (&big.Float{}).SetInt(scaleMultipliers[scale].BigInt()) - out.Mul(out, p).SetPrec(precInBits) + out.SetPrec(precInBits).Mul(out, p) if out.Signbit() { out.Sub(out, pt5) } else { diff --git a/go/arrow/decimal128/decimal128_test.go b/go/arrow/decimal128/decimal128_test.go index 879f2849028f8..4cfd7db20db08 100644 --- a/go/arrow/decimal128/decimal128_test.go +++ b/go/arrow/decimal128/decimal128_test.go @@ -24,6 +24,7 @@ import ( "github.com/apache/arrow/go/v15/arrow/decimal128" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestFromU64(t *testing.T) { @@ -698,3 +699,11 @@ func TestBitLen(t *testing.T) { _, err = decimal128.FromString(b.String(), decimal128.MaxPrecision, -1) assert.ErrorContains(t, err, "bitlen too large for decimal128") } + +func TestFromStringDecimal128b(t *testing.T) { + const decStr = "9323406071781562130.6457232358109488923" + + num, err := decimal128.FromString(decStr, 38, 19) + require.NoError(t, err) + assert.Equal(t, decStr, num.ToString(19)) +} From 
cd0d7f53b3ab7dfac7a3477751a87586d4da3782 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 14 Nov 2023 17:31:08 +0100 Subject: [PATCH 05/23] MINOR: [Python] Fix name of new keyword in the concat_tables future warning (#38710) We renamed the new keyword in a final iteration of the PR, but apparently forgot to update the warning message. Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/table.pxi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index bbf60416de995..e55a0d1dd54cb 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -5226,7 +5226,8 @@ def concat_tables(tables, MemoryPool memory_pool=None, str promote_options="none if "promote" in kwargs: warnings.warn( - "promote has been superseded by mode='default'.", FutureWarning, stacklevel=2) + "promote has been superseded by promote_options='default'.", + FutureWarning, stacklevel=2) if kwargs['promote'] is True: promote_options = "default" From 26149d9fab0360e6d4d9a295f934100470c4bc37 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 14 Nov 2023 11:44:19 -0500 Subject: [PATCH 06/23] GH-38718: [Go][Format][Integration] Add StringView/BinaryView to Go implementation (#35769) ### Rationale for this change See #35628 for the rationale and description of the StringView/BinaryView array types. This change is adding Go as a second implementation of it. ### What changes are included in this PR? Add Array Types for `StringView` and `BinaryView` along with `StringViewType` and `BinaryViewType` and necessary enums and builders. These arrays can be round tripped through JSON and IPC. ### Are these changes tested? 
Yes, unit tests have been added and integration tests run * Closes: [#38718](https://github.com/apache/arrow/issues/38718) * Closes: #38718 Lead-authored-by: Matt Topol Co-authored-by: Alex Shcherbakov Signed-off-by: Benjamin Kietzman --- .gitattributes | 3 + docs/source/status.rst | 4 + format/Schema.fbs | 2 +- go/arrow/array/array.go | 3 +- go/arrow/array/binary.go | 121 +++++++ go/arrow/array/binary_test.go | 24 ++ go/arrow/array/binarybuilder.go | 329 ++++++++++++++++++ go/arrow/array/bufferbuilder.go | 108 ++++++ go/arrow/array/builder.go | 4 + go/arrow/array/compare.go | 12 + go/arrow/array/concat.go | 30 +- go/arrow/array/concat_test.go | 3 + go/arrow/array/string.go | 196 ++++++++++- go/arrow/array/string_test.go | 173 +++++++++ go/arrow/compute/executor.go | 5 +- go/arrow/datatype.go | 7 + go/arrow/datatype_binary.go | 41 +++ go/arrow/datatype_binary_test.go | 30 ++ go/arrow/datatype_viewheader.go | 141 ++++++++ go/arrow/datatype_viewheader_inline.go | 31 ++ go/arrow/datatype_viewheader_inline_go1.19.go | 35 ++ go/arrow/datatype_viewheader_inline_tinygo.go | 35 ++ go/arrow/internal/arrdata/arrdata.go | 81 +++++ go/arrow/internal/arrjson/arrjson.go | 150 ++++++++ go/arrow/internal/arrjson/arrjson_test.go | 259 ++++++++++++++ go/arrow/internal/flatbuf/MetadataVersion.go | 2 +- .../internal/testing/gen/random_array_gen.go | 34 ++ go/arrow/ipc/endian_swap.go | 4 + go/arrow/ipc/file_reader.go | 38 +- go/arrow/ipc/message.go | 10 +- go/arrow/ipc/metadata.go | 40 ++- go/arrow/ipc/writer.go | 36 +- go/arrow/type_traits_view.go | 53 +++ 33 files changed, 2011 insertions(+), 33 deletions(-) create mode 100644 go/arrow/datatype_viewheader.go create mode 100644 go/arrow/datatype_viewheader_inline.go create mode 100644 go/arrow/datatype_viewheader_inline_go1.19.go create mode 100644 go/arrow/datatype_viewheader_inline_tinygo.go create mode 100644 go/arrow/type_traits_view.go diff --git a/.gitattributes b/.gitattributes index 69f4139c4e4f4..70007c26c8b9b 100644 --- 
a/.gitattributes +++ b/.gitattributes @@ -3,6 +3,9 @@ cpp/src/generated/*.cpp linguist-generated=true cpp/src/generated/*.h linguist-generated=true go/**/*.s linguist-generated=true go/arrow/unionmode_string.go linguist-generated=true +go/arrow/internal/flatbuf/*.go linguist-generated=true +go/**/*.pb.go linguist-generated=true +go/parquet/internal/gen-go/parquet/*.go linguist-generated=true r/R/RcppExports.R linguist-generated=true r/R/arrowExports.R linguist-generated=true r/src/RcppExports.cpp linguist-generated=true diff --git a/docs/source/status.rst b/docs/source/status.rst index c8c0e6dfc1dfe..c059ab3cef971 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -68,6 +68,10 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large Utf8 | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Binary View | ✓ | | ✓ | | | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| String View | ✓ | | ✓ | | | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift | diff --git a/format/Schema.fbs b/format/Schema.fbs index 6adbcb115cde3..dbf482e6cc786 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -40,7 +40,7 @@ enum MetadataVersion:short { /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. V4, - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. 
/// diff --git a/go/arrow/array/array.go b/go/arrow/array/array.go index bbe301ee661f3..5aacc8f99a4ee 100644 --- a/go/arrow/array/array.go +++ b/go/arrow/array/array.go @@ -178,7 +178,8 @@ func init() { arrow.RUN_END_ENCODED: func(data arrow.ArrayData) arrow.Array { return NewRunEndEncodedData(data) }, arrow.LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewListViewData(data) }, arrow.LARGE_LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewLargeListViewData(data) }, - + arrow.BINARY_VIEW: func(data arrow.ArrayData) arrow.Array { return NewBinaryViewData(data) }, + arrow.STRING_VIEW: func(data arrow.ArrayData) arrow.Array { return NewStringViewData(data) }, // invalid data types to fill out array to size 2^6 - 1 63: invalidDataType, } diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index bf27139fddbaa..c226297da04c6 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -24,6 +24,7 @@ import ( "unsafe" "github.com/apache/arrow/go/v15/arrow" + "github.com/apache/arrow/go/v15/arrow/memory" "github.com/apache/arrow/go/v15/internal/json" ) @@ -318,6 +319,126 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool { return true } +type ViewLike interface { + arrow.Array + ValueHeader(int) *arrow.ViewHeader +} + +type BinaryView struct { + array + values []arrow.ViewHeader + dataBuffers []*memory.Buffer +} + +func NewBinaryViewData(data arrow.ArrayData) *BinaryView { + a := &BinaryView{} + a.refCount = 1 + a.setData(data.(*Data)) + return a +} + +func (a *BinaryView) setData(data *Data) { + if len(data.buffers) < 2 { + panic("len(data.buffers) < 2") + } + a.array.setData(data) + + if valueData := data.buffers[1]; valueData != nil { + a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes()) + } + + a.dataBuffers = data.buffers[2:] +} + +func (a *BinaryView) ValueHeader(i int) *arrow.ViewHeader { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + return 
&a.values[a.array.data.offset+i] +} + +func (a *BinaryView) Value(i int) []byte { + s := a.ValueHeader(i) + if s.IsInline() { + return s.InlineBytes() + } + start := s.BufferOffset() + buf := a.dataBuffers[s.BufferIndex()] + return buf.Bytes()[start : start+int32(s.Len())] +} + +// ValueString returns the value at index i as a string instead of +// a byte slice, without copying the underlying data. +func (a *BinaryView) ValueString(i int) string { + b := a.Value(i) + return *(*string)(unsafe.Pointer(&b)) +} + +func (a *BinaryView) String() string { + var o strings.Builder + o.WriteString("[") + for i := 0; i < a.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + o.WriteString(NullValueStr) + default: + fmt.Fprintf(&o, "%q", a.ValueString(i)) + } + } + o.WriteString("]") + return o.String() +} + +// ValueStr is paired with AppendValueFromString in that it returns +// the value at index i as a string: Semantically this means that for +// a null value it will return the string "(null)", otherwise it will +// return the value as a base64 encoded string suitable for CSV/JSON. +// +// This is always going to be less performant than just using ValueString +// and exists to fulfill the Array interface to provide a method which +// can produce a human readable string for a given index. 
+func (a *BinaryView) ValueStr(i int) string { + if a.IsNull(i) { + return NullValueStr + } + return base64.StdEncoding.EncodeToString(a.Value(i)) +} + +func (a *BinaryView) GetOneForMarshal(i int) interface{} { + if a.IsNull(i) { + return nil + } + return a.Value(i) +} + +func (a *BinaryView) MarshalJSON() ([]byte, error) { + vals := make([]interface{}, a.Len()) + for i := 0; i < a.Len(); i++ { + vals[i] = a.GetOneForMarshal(i) + } + // golang marshal standard says that []byte will be marshalled + // as a base64-encoded string + return json.Marshal(vals) +} + +func arrayEqualBinaryView(left, right *BinaryView) bool { + leftBufs, rightBufs := left.dataBuffers, right.dataBuffers + for i := 0; i < left.Len(); i++ { + if left.IsNull(i) { + continue + } + if !left.ValueHeader(i).Equals(leftBufs, right.ValueHeader(i), rightBufs) { + return false + } + } + return true +} + var ( _ arrow.Array = (*Binary)(nil) + _ arrow.Array = (*LargeBinary)(nil) + _ arrow.Array = (*BinaryView)(nil) ) diff --git a/go/arrow/array/binary_test.go b/go/arrow/array/binary_test.go index 9c1770950a8b5..c9e165515225b 100644 --- a/go/arrow/array/binary_test.go +++ b/go/arrow/array/binary_test.go @@ -700,3 +700,27 @@ func TestBinaryStringRoundTrip(t *testing.T) { assert.True(t, Equal(arr, arr1)) } + +func TestBinaryViewStringRoundTrip(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "supercalifragilistic", "", "expeallodocious"} + valid := []bool{true, true, false, false, true, true, true} + + b := NewBinaryViewBuilder(mem) + defer b.Release() + + b.AppendStringValues(values, valid) + arr := b.NewArray().(*BinaryView) + defer arr.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b.NewArray().(*BinaryView) + defer arr1.Release() + + assert.True(t, Equal(arr, arr1)) +} diff --git a/go/arrow/array/binarybuilder.go 
b/go/arrow/array/binarybuilder.go index a51bc799e4965..21ad576508e9e 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -23,6 +23,7 @@ import ( "math" "reflect" "sync/atomic" + "unsafe" "github.com/apache/arrow/go/v15/arrow" "github.com/apache/arrow/go/v15/arrow/internal/debug" @@ -370,6 +371,334 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error { return b.Unmarshal(dec) } +const ( + dfltBlockSize = 32 << 10 // 32 KB + viewValueSizeLimit int32 = math.MaxInt32 +) + +type BinaryViewBuilder struct { + builder + dtype arrow.BinaryDataType + + data *memory.Buffer + rawData []arrow.ViewHeader + + blockBuilder multiBufferBuilder +} + +func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder { + return &BinaryViewBuilder{ + dtype: arrow.BinaryTypes.BinaryView, + builder: builder{ + refCount: 1, + mem: mem, + }, + blockBuilder: multiBufferBuilder{ + refCount: 1, + blockSize: dfltBlockSize, + mem: mem, + }, + } +} + +func (b *BinaryViewBuilder) SetBlockSize(sz uint) { + b.blockBuilder.blockSize = int(sz) +} + +func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype } + +func (b *BinaryViewBuilder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) != 0 { + return + } + + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } +} + +func (b *BinaryViewBuilder) init(capacity int) { + b.builder.init(capacity) + b.data = memory.NewResizableBuffer(b.mem) + bytesN := arrow.ViewHeaderTraits.BytesRequired(capacity) + b.data.Resize(bytesN) + b.rawData = arrow.ViewHeaderTraits.CastFromBytes(b.data.Bytes()) +} + +func (b *BinaryViewBuilder) Resize(n int) { + nbuild := n + if n < minBuilderCapacity { + n = minBuilderCapacity + } + + if b.capacity == 0 { + b.init(n) + return + } + + b.builder.resize(nbuild, b.init) + 
b.data.Resize(arrow.ViewHeaderTraits.BytesRequired(n)) + b.rawData = arrow.ViewHeaderTraits.CastFromBytes(b.data.Bytes()) +} + +func (b *BinaryViewBuilder) ReserveData(length int) { + if int32(length) > viewValueSizeLimit { + panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 2GB", + arrow.ErrInvalid)) + } + b.blockBuilder.Reserve(int(length)) +} + +func (b *BinaryViewBuilder) Reserve(n int) { + b.builder.reserve(n, b.Resize) +} + +func (b *BinaryViewBuilder) Append(v []byte) { + if int32(len(v)) > viewValueSizeLimit { + panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 2GB", arrow.ErrInvalid)) + } + + if !arrow.IsViewInline(len(v)) { + b.ReserveData(len(v)) + } + + b.Reserve(1) + b.UnsafeAppend(v) +} + +// AppendString is identical to Append, only accepting a string instead +// of a byte slice, avoiding the extra copy that would occur if you simply +// did []byte(v). +// +// This is different than AppendValueFromString which exists for the +// Builder interface, in that this expects raw binary data which is +// appended unmodified. AppendValueFromString expects base64 encoded binary +// data instead. 
+func (b *BinaryViewBuilder) AppendString(v string) { + // create a []byte without copying the bytes + // in go1.20 this would be unsafe.StringData + val := *(*[]byte)(unsafe.Pointer(&struct { + string + int + }{v, len(v)})) + b.Append(val) +} + +func (b *BinaryViewBuilder) AppendNull() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(false) +} + +func (b *BinaryViewBuilder) AppendNulls(n int) { + b.Reserve(n) + for i := 0; i < n; i++ { + b.UnsafeAppendBoolToBitmap(false) + } +} + +func (b *BinaryViewBuilder) AppendEmptyValue() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(true) +} + +func (b *BinaryViewBuilder) AppendEmptyValues(n int) { + b.Reserve(n) + b.unsafeAppendBoolsToBitmap(nil, n) +} + +func (b *BinaryViewBuilder) UnsafeAppend(v []byte) { + hdr := &b.rawData[b.length] + hdr.SetBytes(v) + if !hdr.IsInline() { + b.blockBuilder.UnsafeAppend(hdr, v) + } + b.UnsafeAppendBoolToBitmap(true) +} + +func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + if len(v) == 0 { + return + } + + b.Reserve(len(v)) + outOfLineTotal := 0 + for i, vv := range v { + if len(valid) == 0 || valid[i] { + if !arrow.IsViewInline(len(vv)) { + outOfLineTotal += len(vv) + } + } + } + + b.ReserveData(outOfLineTotal) + for i, vv := range v { + if len(valid) == 0 || valid[i] { + hdr := &b.rawData[b.length+i] + hdr.SetBytes(vv) + if !hdr.IsInline() { + b.blockBuilder.UnsafeAppend(hdr, vv) + } + } + } + + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + if len(v) == 0 { + return + } + + b.Reserve(len(v)) + outOfLineTotal := 0 + for i, vv := range v { + if len(valid) == 0 || valid[i] { + if !arrow.IsViewInline(len(vv)) { + outOfLineTotal += len(vv) + } + } + } + + b.ReserveData(outOfLineTotal) 
+ for i, vv := range v { + if len(valid) == 0 || valid[i] { + hdr := &b.rawData[b.length+i] + hdr.SetString(vv) + if !hdr.IsInline() { + b.blockBuilder.UnsafeAppendString(hdr, vv) + } + } + } + + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +// AppendValueFromString is paired with ValueStr for fulfilling the +// base Builder interface. This is intended to read in a human-readable +// string such as from CSV or JSON and append it to the array. +// +// For Binary values are expected to be base64 encoded (and will be +// decoded as such before being appended). +func (b *BinaryViewBuilder) AppendValueFromString(s string) error { + if s == NullValueStr { + b.AppendNull() + return nil + } + + if b.dtype.IsUtf8() { + b.Append([]byte(s)) + return nil + } + + decodedVal, err := base64.StdEncoding.DecodeString(s) + if err != nil { + return fmt.Errorf("could not decode base64 string: %w", err) + } + b.Append(decodedVal) + return nil +} + +func (b *BinaryViewBuilder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + return err + } + + switch v := t.(type) { + case string: + data, err := base64.StdEncoding.DecodeString(v) + if err != nil { + return err + } + b.Append(data) + case []byte: + b.Append(v) + case nil: + b.AppendNull() + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Type: reflect.TypeOf([]byte{}), + Offset: dec.InputOffset(), + } + } + return nil +} + +func (b *BinaryViewBuilder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +func (b *BinaryViewBuilder) UnmarshalJSON(data []byte) error { + dec := json.NewDecoder(bytes.NewReader(data)) + t, err := dec.Token() + if err != nil { + return err + } + + if delim, ok := t.(json.Delim); !ok || delim != '[' { + return fmt.Errorf("binary view builder must unpack from json array, found %s", delim) + } + + return b.Unmarshal(dec) +} + +func (b *BinaryViewBuilder) 
newData() (data *Data) { + bytesRequired := arrow.ViewHeaderTraits.BytesRequired(b.length) + if bytesRequired > 0 && bytesRequired < b.data.Len() { + // trim buffers + b.data.Resize(bytesRequired) + } + + dataBuffers := b.blockBuilder.Finish() + data = NewData(b.dtype, b.length, append([]*memory.Buffer{ + b.nullBitmap, b.data}, dataBuffers...), nil, b.nulls, 0) + b.reset() + + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + for _, buf := range dataBuffers { + buf.Release() + } + } + return +} + +func (b *BinaryViewBuilder) NewBinaryViewArray() (a *BinaryView) { + data := b.newData() + a = NewBinaryViewData(data) + data.Release() + return +} + +func (b *BinaryViewBuilder) NewArray() arrow.Array { + return b.NewBinaryViewArray() +} + var ( _ Builder = (*BinaryBuilder)(nil) + _ Builder = (*BinaryViewBuilder)(nil) ) diff --git a/go/arrow/array/bufferbuilder.go b/go/arrow/array/bufferbuilder.go index cb381e25b32a2..13741ba8926ac 100644 --- a/go/arrow/array/bufferbuilder.go +++ b/go/arrow/array/bufferbuilder.go @@ -18,7 +18,9 @@ package array import ( "sync/atomic" + "unsafe" + "github.com/apache/arrow/go/v15/arrow" "github.com/apache/arrow/go/v15/arrow/bitutil" "github.com/apache/arrow/go/v15/arrow/internal/debug" "github.com/apache/arrow/go/v15/arrow/memory" @@ -151,3 +153,109 @@ func (b *bufferBuilder) unsafeAppend(data []byte) { copy(b.bytes[b.length:], data) b.length += len(data) } + +type multiBufferBuilder struct { + refCount int64 + blockSize int + + mem memory.Allocator + blocks []*memory.Buffer + currentOutBuffer int +} + +// Retain increases the reference count by 1. +// Retain may be called simultaneously from multiple goroutines. +func (b *multiBufferBuilder) Retain() { + atomic.AddInt64(&b.refCount, 1) +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. +// Release may be called simultaneously from multiple goroutines. 
+func (b *multiBufferBuilder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + b.Reset() + } +} + +func (b *multiBufferBuilder) Reserve(nbytes int) { + if len(b.blocks) == 0 { + out := memory.NewResizableBuffer(b.mem) + if nbytes < b.blockSize { + nbytes = b.blockSize + } + out.Reserve(nbytes) + b.currentOutBuffer = 0 + b.blocks = []*memory.Buffer{out} + return + } + + curBuf := b.blocks[b.currentOutBuffer] + remain := curBuf.Cap() - curBuf.Len() + if nbytes <= remain { + return + } + + // search for underfull block that has enough bytes + for i, block := range b.blocks { + remaining := block.Cap() - block.Len() + if nbytes <= remaining { + b.currentOutBuffer = i + return + } + } + + // current buffer doesn't have enough space, no underfull buffers + // make new buffer and set that as our current. + newBuf := memory.NewResizableBuffer(b.mem) + if nbytes < b.blockSize { + nbytes = b.blockSize + } + + newBuf.Reserve(nbytes) + b.currentOutBuffer = len(b.blocks) + b.blocks = append(b.blocks, newBuf) +} + +func (b *multiBufferBuilder) RemainingBytes() int { + if len(b.blocks) == 0 { + return 0 + } + + buf := b.blocks[b.currentOutBuffer] + return buf.Cap() - buf.Len() +} + +func (b *multiBufferBuilder) Reset() { + b.currentOutBuffer = 0 + for _, block := range b.Finish() { + block.Release() + } +} + +func (b *multiBufferBuilder) UnsafeAppend(hdr *arrow.ViewHeader, val []byte) { + buf := b.blocks[b.currentOutBuffer] + idx, offset := b.currentOutBuffer, buf.Len() + hdr.SetIndexOffset(int32(idx), int32(offset)) + + n := copy(buf.Buf()[offset:], val) + buf.ResizeNoShrink(offset + n) +} + +func (b *multiBufferBuilder) UnsafeAppendString(hdr *arrow.ViewHeader, val string) { + // create a byte slice with zero-copies + // in go1.20 this would be equivalent to unsafe.StringData + v := *(*[]byte)(unsafe.Pointer(&struct { + string + int + }{val, len(val)})) + b.UnsafeAppend(hdr, v) +} + +func (b 
*multiBufferBuilder) Finish() (out []*memory.Buffer) { + b.currentOutBuffer = 0 + out, b.blocks = b.blocks, nil + return +} diff --git a/go/arrow/array/builder.go b/go/arrow/array/builder.go index bb15298e03ccf..279804a1cdb9f 100644 --- a/go/arrow/array/builder.go +++ b/go/arrow/array/builder.go @@ -364,6 +364,10 @@ func NewBuilder(mem memory.Allocator, dtype arrow.DataType) Builder { case arrow.RUN_END_ENCODED: typ := dtype.(*arrow.RunEndEncodedType) return NewRunEndEncodedBuilder(mem, typ.RunEnds(), typ.Encoded()) + case arrow.BINARY_VIEW: + return NewBinaryViewBuilder(mem) + case arrow.STRING_VIEW: + return NewStringViewBuilder(mem) } panic(fmt.Errorf("arrow/array: unsupported builder for %T", dtype)) } diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go index 778de41e32c67..372293a61d6cb 100644 --- a/go/arrow/array/compare.go +++ b/go/arrow/array/compare.go @@ -232,6 +232,12 @@ func Equal(left, right arrow.Array) bool { case *LargeString: r := right.(*LargeString) return arrayEqualLargeString(l, r) + case *BinaryView: + r := right.(*BinaryView) + return arrayEqualBinaryView(l, r) + case *StringView: + r := right.(*StringView) + return arrayEqualStringView(l, r) case *Int8: r := right.(*Int8) return arrayEqualInt8(l, r) @@ -482,6 +488,12 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { case *LargeString: r := right.(*LargeString) return arrayEqualLargeString(l, r) + case *BinaryView: + r := right.(*BinaryView) + return arrayEqualBinaryView(l, r) + case *StringView: + r := right.(*StringView) + return arrayEqualStringView(l, r) case *Int8: r := right.(*Int8) return arrayEqualInt8(l, r) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index 53c5be06895b9..fa3554c1c0555 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -600,6 +600,35 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, } case arrow.FixedWidthDataType: out.buffers[1] = 
concatBuffers(gatherBuffersFixedWidthType(data, 1, dt), mem) + case arrow.BinaryViewDataType: + out.buffers = out.buffers[:2] + for _, d := range data { + for _, buf := range d.Buffers()[2:] { + buf.Retain() + out.buffers = append(out.buffers, buf) + } + } + + out.buffers[1] = concatBuffers(gatherFixedBuffers(data, 1, arrow.ViewHeaderSizeBytes), mem) + + var ( + s = arrow.ViewHeaderTraits.CastFromBytes(out.buffers[1].Bytes()) + i = data[0].Len() + precedingBufsCount int + ) + + for idx := 1; idx < len(data); idx++ { + precedingBufsCount += len(data[idx-1].Buffers()) - 2 + + for end := i + data[idx].Len(); i < end; i++ { + if s[i].IsInline() { + continue + } + + bufIndex := s[i].BufferIndex() + int32(precedingBufsCount) + s[i].SetIndexOffset(bufIndex, s[i].BufferOffset()) + } + } case arrow.BinaryDataType: offsetWidth := dt.Layout().Buffers[1].ByteWidth offsetBuffer, valueRanges, err := concatOffsets(gatherFixedBuffers(data, 1, offsetWidth), offsetWidth, mem) @@ -739,7 +768,6 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, out.childData[0].Release() return nil, err } - default: return nil, fmt.Errorf("concatenate not implemented for type %s", dt) } diff --git a/go/arrow/array/concat_test.go b/go/arrow/array/concat_test.go index 1cc484ad1a923..7b22d97a41e00 100644 --- a/go/arrow/array/concat_test.go +++ b/go/arrow/array/concat_test.go @@ -84,6 +84,7 @@ func TestConcatenate(t *testing.T) { {arrow.StructOf()}, {arrow.MapOf(arrow.PrimitiveTypes.Uint16, arrow.PrimitiveTypes.Int8)}, {&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrow.PrimitiveTypes.Float64}}, + {arrow.BinaryTypes.StringView}, } for _, tt := range tests { @@ -150,6 +151,8 @@ func (cts *ConcatTestSuite) generateArr(size int64, nullprob float64) arrow.Arra return cts.rng.String(size, 0, 15, nullprob) case arrow.LARGE_STRING: return cts.rng.LargeString(size, 0, 15, nullprob) + case arrow.STRING_VIEW: + return cts.rng.StringView(size, 0, 20, 
nullprob) case arrow.LIST: valuesSize := size * 4 values := cts.rng.Int8(valuesSize, 0, 127, nullprob).(*array.Int8) diff --git a/go/arrow/array/string.go b/go/arrow/array/string.go index 9ab7c938ef5d8..90a4628f0d0fb 100644 --- a/go/arrow/array/string.go +++ b/go/arrow/array/string.go @@ -28,6 +28,11 @@ import ( "github.com/apache/arrow/go/v15/internal/json" ) +type StringLike interface { + arrow.Array + Value(int) string +} + // String represents an immutable sequence of variable-length UTF-8 strings. type String struct { array @@ -310,6 +315,108 @@ func arrayEqualLargeString(left, right *LargeString) bool { return true } +type StringView struct { + array + values []arrow.ViewHeader + dataBuffers []*memory.Buffer +} + +func NewStringViewData(data arrow.ArrayData) *StringView { + a := &StringView{} + a.refCount = 1 + a.setData(data.(*Data)) + return a +} + +// Reset resets the StringView with a different set of Data. +func (a *StringView) Reset(data arrow.ArrayData) { + a.setData(data.(*Data)) +} + +func (a *StringView) setData(data *Data) { + if len(data.buffers) < 2 { + panic("len(data.buffers) < 2") + } + a.array.setData(data) + + if valueData := data.buffers[1]; valueData != nil { + a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes()) + } + + a.dataBuffers = data.buffers[2:] +} + +func (a *StringView) ValueHeader(i int) *arrow.ViewHeader { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + return &a.values[a.array.data.offset+i] +} + +func (a *StringView) Value(i int) string { + s := a.ValueHeader(i) + if s.IsInline() { + return s.InlineString() + } + start := s.BufferOffset() + buf := a.dataBuffers[s.BufferIndex()] + value := buf.Bytes()[start : start+int32(s.Len())] + return *(*string)(unsafe.Pointer(&value)) +} + +func (a *StringView) String() string { + var o strings.Builder + o.WriteString("[") + for i := 0; i < a.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + 
o.WriteString(NullValueStr) + default: + fmt.Fprintf(&o, "%q", a.Value(i)) + } + } + o.WriteString("]") + return o.String() +} + +func (a *StringView) ValueStr(i int) string { + if a.IsNull(i) { + return NullValueStr + } + return a.Value(i) +} + +func (a *StringView) GetOneForMarshal(i int) interface{} { + if a.IsNull(i) { + return nil + } + return a.Value(i) +} + +func (a *StringView) MarshalJSON() ([]byte, error) { + vals := make([]interface{}, a.Len()) + for i := 0; i < a.Len(); i++ { + vals[i] = a.GetOneForMarshal(i) + } + return json.Marshal(vals) +} + +func arrayEqualStringView(left, right *StringView) bool { + leftBufs, rightBufs := left.dataBuffers, right.dataBuffers + for i := 0; i < left.Len(); i++ { + if left.IsNull(i) { + continue + } + if !left.ValueHeader(i).Equals(leftBufs, right.ValueHeader(i), rightBufs) { + return false + } + } + return true +} + // A StringBuilder is used to build a String array using the Append methods. type StringBuilder struct { *BinaryBuilder @@ -344,10 +451,6 @@ func (b *StringBuilder) Value(i int) string { return string(b.BinaryBuilder.Value(i)) } -// func (b *StringBuilder) UnsafeAppend(v string) { -// b.BinaryBuilder.UnsafeAppend([]byte(v)) -// } - // NewArray creates a String array from the memory buffers used by the builder and resets the StringBuilder // so it can be used to build a new array. func (b *StringBuilder) NewArray() arrow.Array { @@ -441,10 +544,6 @@ func (b *LargeStringBuilder) Value(i int) string { return string(b.BinaryBuilder.Value(i)) } -// func (b *LargeStringBuilder) UnsafeAppend(v string) { -// b.BinaryBuilder.UnsafeAppend([]byte(v)) -// } - // NewArray creates a String array from the memory buffers used by the builder and resets the StringBuilder // so it can be used to build a new array. 
func (b *LargeStringBuilder) NewArray() arrow.Array { @@ -504,9 +603,87 @@ func (b *LargeStringBuilder) UnmarshalJSON(data []byte) error { return b.Unmarshal(dec) } +type StringViewBuilder struct { + *BinaryViewBuilder +} + +func NewStringViewBuilder(mem memory.Allocator) *StringViewBuilder { + bldr := &StringViewBuilder{ + BinaryViewBuilder: NewBinaryViewBuilder(mem), + } + bldr.dtype = arrow.BinaryTypes.StringView + return bldr +} + +func (b *StringViewBuilder) Append(v string) { + b.BinaryViewBuilder.AppendString(v) +} + +func (b *StringViewBuilder) AppendValues(v []string, valid []bool) { + b.BinaryViewBuilder.AppendStringValues(v, valid) +} + +func (b *StringViewBuilder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + return err + } + + switch v := t.(type) { + case string: + b.Append(v) + case []byte: + b.BinaryViewBuilder.Append(v) + case nil: + b.AppendNull() + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Type: reflect.TypeOf([]byte{}), + Offset: dec.InputOffset(), + } + } + return nil +} + +func (b *StringViewBuilder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +func (b *StringViewBuilder) UnmarshalJSON(data []byte) error { + dec := json.NewDecoder(bytes.NewReader(data)) + t, err := dec.Token() + if err != nil { + return err + } + + if delim, ok := t.(json.Delim); !ok || delim != '[' { + return fmt.Errorf("binary view builder must unpack from json array, found %s", delim) + } + + return b.Unmarshal(dec) +} + +func (b *StringViewBuilder) NewArray() arrow.Array { + return b.NewStringViewArray() +} + +func (b *StringViewBuilder) NewStringViewArray() (a *StringView) { + data := b.newData() + a = NewStringViewData(data) + data.Release() + return +} + type StringLikeBuilder interface { Builder Append(string) + AppendValues([]string, []bool) UnsafeAppend([]byte) ReserveData(int) } @@ -514,8 +691,11 @@ type 
StringLikeBuilder interface { var ( _ arrow.Array = (*String)(nil) _ arrow.Array = (*LargeString)(nil) + _ arrow.Array = (*StringView)(nil) _ Builder = (*StringBuilder)(nil) _ Builder = (*LargeStringBuilder)(nil) + _ Builder = (*StringViewBuilder)(nil) _ StringLikeBuilder = (*StringBuilder)(nil) _ StringLikeBuilder = (*LargeStringBuilder)(nil) + _ StringLikeBuilder = (*StringViewBuilder)(nil) ) diff --git a/go/arrow/array/string_test.go b/go/arrow/array/string_test.go index d743a3ec7f37f..803fae51347c1 100644 --- a/go/arrow/array/string_test.go +++ b/go/arrow/array/string_test.go @@ -619,3 +619,176 @@ func TestStringValueLen(t *testing.T) { assert.Equal(t, len(v), slice.ValueLen(i)) } } +func TestStringViewArray(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + var ( + // only the last string is long enough to not get inlined + want = []string{"hello", "世界", "", "say goodbye daffy"} + valids = []bool{true, true, false, true} + ) + + sb := array.NewStringViewBuilder(mem) + defer sb.Release() + + sb.Retain() + sb.Release() + + assert.NoError(t, sb.AppendValueFromString(want[0])) + sb.AppendValues(want[1:2], nil) + + sb.AppendNull() + sb.Append(want[3]) + + if got, want := sb.Len(), len(want); got != want { + t.Fatalf("invalid len: got=%d, want=%d", got, want) + } + + if got, want := sb.NullN(), 1; got != want { + t.Fatalf("invalid nulls: got=%d, want=%d", got, want) + } + + arr := sb.NewStringViewArray() + defer arr.Release() + + arr.Retain() + arr.Release() + + assert.Equal(t, "hello", arr.ValueStr(0)) + + if got, want := arr.Len(), len(want); got != want { + t.Fatalf("invalid len: got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 1; got != want { + t.Fatalf("invalid nulls: got=%d, want=%d", got, want) + } + + for i := range want { + if arr.IsNull(i) != !valids[i] { + t.Fatalf("arr[%d]-validity: got=%v want=%v", i, !arr.IsNull(i), valids[i]) + } + switch { + case arr.IsNull(i): + default: + 
got := arr.Value(i) + if got != want[i] { + t.Fatalf("arr[%d]: got=%q, want=%q", i, got, want[i]) + } + } + } + + sub := array.MakeFromData(arr.Data()) + defer sub.Release() + + if sub.DataType().ID() != arrow.STRING_VIEW { + t.Fatalf("invalid type: got=%q, want=string view", sub.DataType().Name()) + } + + if _, ok := sub.(*array.StringView); !ok { + t.Fatalf("could not type-assert to array.String") + } + + if got, want := arr.String(), `["hello" "世界" (null) "say goodbye daffy"]`; got != want { + t.Fatalf("got=%q, want=%q", got, want) + } + + // only the last string gets stuck into a buffer the rest are inlined + // in the headers. + if !bytes.Equal([]byte(`say goodbye daffy`), arr.Data().Buffers()[2].Bytes()) { + t.Fatalf("got=%q, want=%q", string(arr.Data().Buffers()[2].Bytes()), `say goodbye daffy`) + } + + // check the prefix for the non-inlined value + if [4]byte{'s', 'a', 'y', ' '} != arr.ValueHeader(3).Prefix() { + t.Fatalf("got=%q, want=%q", arr.ValueHeader(3).Prefix(), `say `) + } + + slice := array.NewSliceData(arr.Data(), 2, 4) + defer slice.Release() + + sub1 := array.MakeFromData(slice) + defer sub1.Release() + + v, ok := sub1.(*array.StringView) + if !ok { + t.Fatalf("could not type-assert to array.StringView") + } + + if got, want := v.String(), `[(null) "say goodbye daffy"]`; got != want { + t.Fatalf("got=%q, want=%q", got, want) + } + + if !bytes.Equal([]byte(`say goodbye daffy`), v.Data().Buffers()[2].Bytes()) { + t.Fatalf("got=%q, want=%q", string(v.Data().Buffers()[2].Bytes()), `say goodbye daffy`) + } + + // check the prefix for the non-inlined value + if [4]byte{'s', 'a', 'y', ' '} != v.ValueHeader(1).Prefix() { + t.Fatalf("got=%q, want=%q", v.ValueHeader(1).Prefix(), `say `) + } +} + +func TestStringViewBuilder_Empty(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + want := []string{"hello", "世界", "", "say goodbye daffy"} + + ab := array.NewStringViewBuilder(mem) + defer ab.Release() 
+ + stringValues := func(a *array.StringView) []string { + vs := make([]string, a.Len()) + for i := range vs { + vs[i] = a.Value(i) + } + return vs + } + + ab.AppendValues([]string{}, nil) + a := ab.NewStringViewArray() + assert.Zero(t, a.Len()) + a.Release() + + ab.AppendValues(nil, nil) + a = ab.NewStringViewArray() + assert.Zero(t, a.Len()) + a.Release() + + ab.AppendValues([]string{}, nil) + ab.AppendValues(want, nil) + a = ab.NewStringViewArray() + assert.Equal(t, want, stringValues(a)) + a.Release() + + ab.AppendValues(want, nil) + ab.AppendValues([]string{}, nil) + a = ab.NewStringViewArray() + assert.Equal(t, want, stringValues(a)) + a.Release() +} + +// TestStringViewReset tests the Reset() method on the StringView type by creating two different StringViews and then +// resetting the contents of string2 with the values from string1. +func TestStringViewReset(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + sb1 := array.NewStringViewBuilder(mem) + sb2 := array.NewStringViewBuilder(mem) + defer sb1.Release() + defer sb2.Release() + + sb1.Append("string1") + sb1.AppendNull() + + var ( + string1 = sb1.NewStringViewArray() + string2 = sb2.NewStringViewArray() + + string1Data = string1.Data() + ) + string2.Reset(string1Data) + + assert.Equal(t, "string1", string2.Value(0)) +} diff --git a/go/arrow/compute/executor.go b/go/arrow/compute/executor.go index 1cba0b1e19f69..db89b206daf5f 100644 --- a/go/arrow/compute/executor.go +++ b/go/arrow/compute/executor.go @@ -171,6 +171,8 @@ func addComputeDataPrealloc(dt arrow.DataType, widths []bufferPrealloc) []buffer return append(widths, bufferPrealloc{bitWidth: 32, addLen: 1}) case arrow.LARGE_BINARY, arrow.LARGE_STRING, arrow.LARGE_LIST: return append(widths, bufferPrealloc{bitWidth: 64, addLen: 1}) + case arrow.STRING_VIEW, arrow.BINARY_VIEW: + return append(widths, bufferPrealloc{bitWidth: arrow.ViewHeaderSizeBytes * 8}) } return widths } @@ -1007,9 +1009,10 @@ func (v *vectorExecutor) WrapResults(ctx 
context.Context, out <-chan Datum, hasC case <-ctx.Done(): return nil case output = <-out: - if output == nil { + if output == nil || ctx.Err() != nil { return nil } + // if the inputs contained at least one chunked array // then we want to return chunked output if hasChunked { diff --git a/go/arrow/datatype.go b/go/arrow/datatype.go index 24113b55899dc..1e5d8fb98aa59 100644 --- a/go/arrow/datatype.go +++ b/go/arrow/datatype.go @@ -210,6 +210,11 @@ type BinaryDataType interface { binary() } +type BinaryViewDataType interface { + BinaryDataType + view() +} + type OffsetsDataType interface { DataType OffsetTypeTraits() OffsetTraits @@ -272,6 +277,8 @@ func (b BufferSpec) Equals(other BufferSpec) bool { type DataTypeLayout struct { Buffers []BufferSpec HasDict bool + // VariadicSpec is what the buffers beyond len(Buffers) are expected to conform to. + VariadicSpec *BufferSpec } func SpecFixedWidth(w int) BufferSpec { return BufferSpec{KindFixedWidth, w} } diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index a3a8568645052..f3e601f08ec79 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go @@ -83,16 +83,57 @@ func (t *LargeStringType) Layout() DataTypeLayout { func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits } func (LargeStringType) IsUtf8() bool { return true } +type BinaryViewType struct{} + +func (*BinaryViewType) ID() Type { return BINARY_VIEW } +func (*BinaryViewType) Name() string { return "binary_view" } +func (*BinaryViewType) String() string { return "binary_view" } +func (*BinaryViewType) IsUtf8() bool { return false } +func (*BinaryViewType) binary() {} +func (*BinaryViewType) view() {} +func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) } +func (*BinaryViewType) Layout() DataTypeLayout { + variadic := SpecVariableWidth() + return DataTypeLayout{ + Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(ViewHeaderSizeBytes)}, + VariadicSpec: &variadic, + } +} + +type 
StringViewType struct{} + +func (*StringViewType) ID() Type { return STRING_VIEW } +func (*StringViewType) Name() string { return "string_view" } +func (*StringViewType) String() string { return "string_view" } +func (*StringViewType) IsUtf8() bool { return true } +func (*StringViewType) binary() {} +func (*StringViewType) view() {} +func (t *StringViewType) Fingerprint() string { return typeFingerprint(t) } +func (*StringViewType) Layout() DataTypeLayout { + variadic := SpecVariableWidth() + return DataTypeLayout{ + Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(ViewHeaderSizeBytes)}, + VariadicSpec: &variadic, + } +} + var ( BinaryTypes = struct { Binary BinaryDataType String BinaryDataType LargeBinary BinaryDataType LargeString BinaryDataType + BinaryView BinaryDataType + StringView BinaryDataType }{ Binary: &BinaryType{}, String: &StringType{}, LargeBinary: &LargeBinaryType{}, LargeString: &LargeStringType{}, + BinaryView: &BinaryViewType{}, + StringView: &StringViewType{}, } + + _ BinaryViewDataType = (*StringViewType)(nil) + _ BinaryViewDataType = (*BinaryViewType)(nil) ) diff --git a/go/arrow/datatype_binary_test.go b/go/arrow/datatype_binary_test.go index 25ba6e8db4ba4..083d69ee3e5d4 100644 --- a/go/arrow/datatype_binary_test.go +++ b/go/arrow/datatype_binary_test.go @@ -81,3 +81,33 @@ func TestLargeStringType(t *testing.T) { t.Fatalf("invalid string type stringer. got=%v, want=%v", got, want) } } + +func TestBinaryViewType(t *testing.T) { + var nt *arrow.BinaryViewType + if got, want := nt.ID(), arrow.BINARY_VIEW; got != want { + t.Fatalf("invalid string type id. got=%v, want=%v", got, want) + } + + if got, want := nt.Name(), "binary_view"; got != want { + t.Fatalf("invalid string type name. got=%v, want=%v", got, want) + } + + if got, want := nt.String(), "binary_view"; got != want { + t.Fatalf("invalid string type stringer. 
got=%v, want=%v", got, want) + } +} + +func TestStringViewType(t *testing.T) { + var nt *arrow.StringViewType + if got, want := nt.ID(), arrow.STRING_VIEW; got != want { + t.Fatalf("invalid string type id. got=%v, want=%v", got, want) + } + + if got, want := nt.Name(), "string_view"; got != want { + t.Fatalf("invalid string type name. got=%v, want=%v", got, want) + } + + if got, want := nt.String(), "string_view"; got != want { + t.Fatalf("invalid string type stringer. got=%v, want=%v", got, want) + } +} diff --git a/go/arrow/datatype_viewheader.go b/go/arrow/datatype_viewheader.go new file mode 100644 index 0000000000000..54b9256b34604 --- /dev/null +++ b/go/arrow/datatype_viewheader.go @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package arrow + +import ( + "bytes" + "unsafe" + + "github.com/apache/arrow/go/v15/arrow/endian" + "github.com/apache/arrow/go/v15/arrow/internal/debug" + "github.com/apache/arrow/go/v15/arrow/memory" +) + +const ( + ViewPrefixLen = 4 + viewInlineSize = 12 +) + +func IsViewInline(length int) bool { + return length < viewInlineSize +} + +// ViewHeader is a variable length string (utf8) or byte slice with +// a 4 byte prefix and inline optimization for small values (12 bytes +// or fewer). This is similar to Go's standard string but limited by +// a length of Uint32Max and up to the first four bytes of the string +// are copied into the struct. This prefix allows failing comparisons +// early and can reduce CPU cache working set when dealing with short +// strings. +// +// There are two situations: +// +// Entirely inlined string data +// |----|------------| +// ^ ^ +// | | +// size inline string data, zero padded +// +// Reference into buffer +// |----|----|----|----| +// ^ ^ ^ ^ +// | | | | +// size prefix buffer index and offset to out-of-line portion +// +// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. +// +// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf +type ViewHeader struct { + size int32 + // the first 4 bytes of this are the prefix for the string + // if size <= StringHeaderInlineSize, then the entire string + // is in the data array and is zero padded. + // if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32 + // values which are the buffer index and offset in that buffer + // containing the full string. 
+ data [viewInlineSize]byte +} + +func (sh *ViewHeader) IsInline() bool { + return sh.size <= int32(viewInlineSize) +} + +func (sh *ViewHeader) Len() int { return int(sh.size) } +func (sh *ViewHeader) Prefix() [ViewPrefixLen]byte { + return *(*[4]byte)(unsafe.Pointer(&sh.data)) +} + +func (sh *ViewHeader) BufferIndex() int32 { + return int32(endian.Native.Uint32(sh.data[ViewPrefixLen:])) +} + +func (sh *ViewHeader) BufferOffset() int32 { + return int32(endian.Native.Uint32(sh.data[ViewPrefixLen+4:])) +} + +func (sh *ViewHeader) InlineBytes() (data []byte) { + debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline ViewHeader") + return sh.data[:sh.size] +} + +func (sh *ViewHeader) SetBytes(data []byte) int { + sh.size = int32(len(data)) + if sh.IsInline() { + return copy(sh.data[:], data) + } + return copy(sh.data[:4], data) +} + +func (sh *ViewHeader) SetString(data string) int { + sh.size = int32(len(data)) + if sh.IsInline() { + return copy(sh.data[:], data) + } + return copy(sh.data[:4], data) +} + +func (sh *ViewHeader) SetIndexOffset(bufferIndex, offset int32) { + endian.Native.PutUint32(sh.data[ViewPrefixLen:], uint32(bufferIndex)) + endian.Native.PutUint32(sh.data[ViewPrefixLen+4:], uint32(offset)) +} + +func (sh *ViewHeader) Equals(buffers []*memory.Buffer, other *ViewHeader, otherBuffers []*memory.Buffer) bool { + if sh.sizeAndPrefixAsInt64() != other.sizeAndPrefixAsInt64() { + return false + } + + if sh.IsInline() { + return sh.inlinedAsInt64() == other.inlinedAsInt64() + } + + return bytes.Equal(sh.getBufferBytes(buffers), other.getBufferBytes(otherBuffers)) +} + +func (sh *ViewHeader) getBufferBytes(buffers []*memory.Buffer) []byte { + offset := sh.BufferOffset() + return buffers[sh.BufferIndex()].Bytes()[offset : offset+sh.size] +} + +func (sh *ViewHeader) inlinedAsInt64() int64 { + s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) + return s[1] +} + +func (sh *ViewHeader) sizeAndPrefixAsInt64() int64 { + s := 
unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) + return s[0] +} diff --git a/go/arrow/datatype_viewheader_inline.go b/go/arrow/datatype_viewheader_inline.go new file mode 100644 index 0000000000000..89ac1d06adcdf --- /dev/null +++ b/go/arrow/datatype_viewheader_inline.go @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.20 + +package arrow + +import ( + "unsafe" + + "github.com/apache/arrow/go/v15/arrow/internal/debug" +) + +func (sh *ViewHeader) InlineString() (data string) { + debug.Assert(sh.IsInline(), "calling InlineString on non-inline ViewHeader") + + return unsafe.String((*byte)(unsafe.Pointer(&sh.data)), sh.size) +} diff --git a/go/arrow/datatype_viewheader_inline_go1.19.go b/go/arrow/datatype_viewheader_inline_go1.19.go new file mode 100644 index 0000000000000..aec66009d9492 --- /dev/null +++ b/go/arrow/datatype_viewheader_inline_go1.19.go @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !go1.20 && !tinygo + +package arrow + +import ( + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v15/arrow/internal/debug" +) + +func (sh *ViewHeader) InlineString() (data string) { + debug.Assert(sh.IsInline(), "calling InlineString on non-inline ViewHeader") + + h := (*reflect.StringHeader)(unsafe.Pointer(&data)) + h.Data = uintptr(unsafe.Pointer(&sh.data)) + h.Len = int(sh.size) + return +} diff --git a/go/arrow/datatype_viewheader_inline_tinygo.go b/go/arrow/datatype_viewheader_inline_tinygo.go new file mode 100644 index 0000000000000..bff63a273a722 --- /dev/null +++ b/go/arrow/datatype_viewheader_inline_tinygo.go @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !go1.20 && tinygo + +package arrow + +import ( + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v15/arrow/internal/debug" +) + +func (sh *ViewHeader) InlineString() (data string) { + debug.Assert(sh.IsInline(), "calling InlineString on non-inline ViewHeader") + + h := (*reflect.StringHeader)(unsafe.Pointer(&data)) + h.Data = uintptr(unsafe.Pointer(&sh.data)) + h.Len = uintptr(sh.size) + return +} diff --git a/go/arrow/internal/arrdata/arrdata.go b/go/arrow/internal/arrdata/arrdata.go index 6631e4245c19d..985388094eb51 100644 --- a/go/arrow/internal/arrdata/arrdata.go +++ b/go/arrow/internal/arrdata/arrdata.go @@ -54,6 +54,7 @@ func init() { Records["extension"] = makeExtensionRecords() Records["union"] = makeUnionRecords() Records["run_end_encoded"] = makeRunEndEncodedRecords() + Records["view_types"] = makeStringViewRecords() for k := range Records { RecordNames = append(RecordNames, k) @@ -1155,6 +1156,65 @@ func makeRunEndEncodedRecords() []arrow.Record { return recs } +func makeStringViewRecords() []arrow.Record { + mem := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "binary_view", Type: arrow.BinaryTypes.BinaryView, Nullable: true}, + {Name: "string_view", Type: arrow.BinaryTypes.StringView, Nullable: true}, + }, nil) + + mask := []bool{true, false, false, true, true} + chunks := [][]arrow.Array{ + { + viewTypeArrayOf(mem, [][]byte{[]byte("1é"), []byte("2"), []byte("3"), []byte("4"), []byte("5")}, mask), + viewTypeArrayOf(mem, []string{"1é", "2", "3", "4", "5"}, mask), + }, + { + viewTypeArrayOf(mem, [][]byte{[]byte("1é"), []byte("22222222222222"), []byte("33333333333333"), []byte("4444"), []byte("5555")}, mask), + viewTypeArrayOf(mem, []string{"1é", "22222222222222", "33333333333333", "4444", "5555"}, nil), + }, + { + viewTypeArrayOf(mem, [][]byte{[]byte("1é1é"), []byte("22222222222222"), 
[]byte("33333333333333"), []byte("44"), []byte("55")}, nil), + viewTypeArrayOf(mem, []string{"1é1é", "22222222222222", "33333333333333", "44", "55"}, mask), + }, + } + + defer func() { + for _, chunk := range chunks { + for _, col := range chunk { + col.Release() + } + } + }() + + recs := make([]arrow.Record, len(chunks)) + for i, chunk := range chunks { + recs[i] = array.NewRecord(schema, chunk, -1) + } + + return recs +} + +func viewTypeArrayOf(mem memory.Allocator, a interface{}, valids []bool) arrow.Array { + if mem == nil { + mem = memory.NewGoAllocator() + } + + switch a := a.(type) { + case []string: + bldr := array.NewStringViewBuilder(mem) + defer bldr.Release() + bldr.AppendValues(a, valids) + return bldr.NewArray() + case [][]byte: + bldr := array.NewBinaryViewBuilder(mem) + defer bldr.Release() + bldr.AppendValues(a, valids) + return bldr.NewArray() + } + return nil +} + func extArray(mem memory.Allocator, dt arrow.ExtensionType, a interface{}, valids []bool) arrow.Array { var storage arrow.Array switch st := dt.StorageType().(type) { @@ -1750,5 +1810,26 @@ func buildArray(bldr array.Builder, data arrow.Array) { bldr.AppendNull() } } + + case *array.BinaryViewBuilder: + data := data.(*array.BinaryView) + for i := 0; i < data.Len(); i++ { + switch { + case data.IsValid(i): + bldr.Append(data.Value(i)) + default: + bldr.AppendNull() + } + } + case *array.StringViewBuilder: + data := data.(*array.StringView) + for i := 0; i < data.Len(); i++ { + switch { + case data.IsValid(i): + bldr.Append(data.Value(i)) + default: + bldr.AppendNull() + } + } } } diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 87bdc1f44d875..f74b615362642 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -158,6 +158,10 @@ func typeToJSON(arrowType arrow.DataType) (json.RawMessage, error) { typ = nameJSON{"utf8"} case *arrow.LargeStringType: typ = nameJSON{"largeutf8"} + case 
*arrow.BinaryViewType: + typ = nameJSON{"binaryview"} + case *arrow.StringViewType: + typ = nameJSON{"utf8view"} case *arrow.Date32Type: typ = unitZoneJSON{Name: "date", Unit: "DAY"} case *arrow.Date64Type: @@ -342,6 +346,10 @@ func typeFromJSON(typ json.RawMessage, children []FieldWrapper) (arrowType arrow arrowType = arrow.BinaryTypes.String case "largeutf8": arrowType = arrow.BinaryTypes.LargeString + case "binaryview": + arrowType = arrow.BinaryTypes.BinaryView + case "utf8view": + arrowType = arrow.BinaryTypes.StringView case "date": t := unitZoneJSON{} if err = json.Unmarshal(typ, &t); err != nil { @@ -818,6 +826,7 @@ type Array struct { Offset interface{} `json:"OFFSET,omitempty"` Size interface{} `json:"SIZE,omitempty"` Children []Array `json:"children,omitempty"` + Variadic []string `json:"VARIADIC_BUFFERS,omitempty"` } func (a *Array) MarshalJSON() ([]byte, error) { @@ -1078,6 +1087,18 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr bldr.AppendValues(data, valids) return returnNewArrayData(bldr) + case arrow.BinaryViewDataType: + valids := validsToBitmap(validsFromJSON(arr.Valids), mem) + nulls := arr.Count - bitutil.CountSetBits(valids.Bytes(), 0, arr.Count) + headers := stringHeadersFromJSON(mem, !dt.IsUtf8(), arr.Data) + extraBufs := variadicBuffersFromJSON(arr.Variadic) + defer valids.Release() + defer headers.Release() + + return array.NewData(dt, arr.Count, + append([]*memory.Buffer{valids, headers}, extraBufs...), + nil, nulls, 0) + case *arrow.ListType: valids := validsFromJSON(arr.Valids) elems := arrayFromJSON(mem, dt.Elem(), arr.Children[0]) @@ -1486,6 +1507,24 @@ func arrayToJSON(field arrow.Field, arr arrow.Array) Array { Offset: strOffsets, } + case *array.StringView: + variadic := variadicBuffersToJSON(arr.Data().Buffers()[2:]) + return Array{ + Name: field.Name, + Count: arr.Len(), + Valids: validsToJSON(arr), + Data: stringHeadersToJSON(arr, false), + Variadic: variadic, + } + case *array.BinaryView: + 
variadic := variadicBuffersToJSON(arr.Data().Buffers()[2:]) + return Array{ + Name: field.Name, + Count: arr.Len(), + Valids: validsToJSON(arr), + Data: stringHeadersToJSON(arr, true), + Variadic: variadic, + } case *array.List: o := Array{ Name: field.Name, @@ -2309,3 +2348,114 @@ func durationToJSON(arr *array.Duration) []interface{} { } return o } + +func variadicBuffersFromJSON(bufs []string) []*memory.Buffer { + out := make([]*memory.Buffer, len(bufs)) + for i, data := range bufs { + rawData, err := hex.DecodeString(data) + if err != nil { + panic(err) + } + + out[i] = memory.NewBufferBytes(rawData) + } + return out +} + +func variadicBuffersToJSON(bufs []*memory.Buffer) []string { + out := make([]string, len(bufs)) + for i, data := range bufs { + out[i] = strings.ToUpper(hex.EncodeToString(data.Bytes())) + } + return out +} + +func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface{}) *memory.Buffer { + buf := memory.NewResizableBuffer(mem) + buf.Resize(arrow.ViewHeaderTraits.BytesRequired(len(data))) + + values := arrow.ViewHeaderTraits.CastFromBytes(buf.Bytes()) + + for i, d := range data { + switch v := d.(type) { + case nil: + continue + case map[string]interface{}: + if inlined, ok := v["INLINED"]; ok { + if isBinary { + val, err := hex.DecodeString(inlined.(string)) + if err != nil { + panic(fmt.Errorf("could not decode %v: %v", inlined, err)) + } + values[i].SetBytes(val) + } else { + values[i].SetString(inlined.(string)) + } + continue + } + + idx, offset := v["BUFFER_INDEX"].(json.Number), v["OFFSET"].(json.Number) + bufIdx, err := idx.Int64() + if err != nil { + panic(err) + } + + bufOffset, err := offset.Int64() + if err != nil { + panic(err) + } + + values[i].SetIndexOffset(int32(bufIdx), int32(bufOffset)) + prefix, err := hex.DecodeString(v["PREFIX"].(string)) + if err != nil { + panic(err) + } + sz, err := v["SIZE"].(json.Number).Int64() + if err != nil { + panic(err) + } + + rawData := make([]byte, sz) + copy(rawData, 
prefix) + values[i].SetBytes(rawData) + } + } + return buf +} + +func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} { + type StringHeader struct { + Size int `json:"SIZE"` + Prefix *string `json:"PREFIX,omitempty"` + BufferIdx *int `json:"BUFFER_INDEX,omitempty"` + BufferOff *int `json:"OFFSET,omitempty"` + Inlined *string `json:"INLINED,omitempty"` + } + + o := make([]interface{}, arr.Len()) + for i := range o { + hdr := arr.ValueHeader(i) + if hdr.IsInline() { + data := hdr.InlineString() + if isBinary { + data = strings.ToUpper(hex.EncodeToString(hdr.InlineBytes())) + } + o[i] = StringHeader{ + Size: hdr.Len(), + Inlined: &data, + } + continue + } + + idx, off := int(hdr.BufferIndex()), int(hdr.BufferOffset()) + prefix := hdr.Prefix() + encodedPrefix := strings.ToUpper(hex.EncodeToString(prefix[:])) + o[i] = StringHeader{ + Size: hdr.Len(), + Prefix: &encodedPrefix, + BufferIdx: &idx, + BufferOff: &off, + } + } + return o +} diff --git a/go/arrow/internal/arrjson/arrjson_test.go b/go/arrow/internal/arrjson/arrjson_test.go index 7beadee370edb..31f3cb238ec16 100644 --- a/go/arrow/internal/arrjson/arrjson_test.go +++ b/go/arrow/internal/arrjson/arrjson_test.go @@ -48,6 +48,7 @@ func TestReadWrite(t *testing.T) { wantJSONs["dictionary"] = makeDictionaryWantJSONs() wantJSONs["union"] = makeUnionWantJSONs() wantJSONs["run_end_encoded"] = makeRunEndEncodedWantJSONs() + wantJSONs["view_types"] = makeViewTypesWantJSONs() tempDir := t.TempDir() for name, recs := range arrdata.Records { @@ -6127,3 +6128,261 @@ func makeRunEndEncodedWantJSONs() string { ] }` } + +func makeViewTypesWantJSONs() string { + return `{ + "schema": { + "fields": [ + { + "name": "binary_view", + "type": { + "name": "binaryview" + }, + "nullable": true, + "children": [] + }, + { + "name": "string_view", + "type": { + "name": "utf8view" + }, + "nullable": true, + "children": [] + } + ] + }, + "batches": [ + { + "count": 5, + "columns": [ + { + "name": "binary_view", + "count": 
5, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 3, + "INLINED": "31C3A9" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 1, + "INLINED": "34" + }, + { + "SIZE": 1, + "INLINED": "35" + } + ], + "VARIADIC_BUFFERS": [""] + }, + { + "name": "string_view", + "count": 5, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 3, + "INLINED": "1é" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 1, + "INLINED": "4" + }, + { + "SIZE": 1, + "INLINED": "5" + } + ], + "VARIADIC_BUFFERS": [""] + } + ] + }, + { + "count": 5, + "columns": [ + { + "name": "binary_view", + "count": 5, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 3, + "INLINED": "31C3A9" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 4, + "INLINED": "34343434" + }, + { + "SIZE": 4, + "INLINED": "35353535" + } + ], + "VARIADIC_BUFFERS": [""] + }, + { + "name": "string_view", + "count": 5, + "VALIDITY": [ + 1, + 1, + 1, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 3, + "INLINED": "1é" + }, + { + "SIZE": 14, + "PREFIX": "32323232", + "BUFFER_INDEX": 0, + "OFFSET": 0 + }, + { + "SIZE": 14, + "PREFIX": "33333333", + "BUFFER_INDEX": 0, + "OFFSET": 14 + }, + { + "SIZE": 4, + "INLINED": "4444" + }, + { + "SIZE": 4, + "INLINED": "5555" + } + ], + "VARIADIC_BUFFERS": [ + "32323232323232323232323232323333333333333333333333333333" + ] + } + ] + }, + { + "count": 5, + "columns": [ + { + "name": "binary_view", + "count": 5, + "VALIDITY": [ + 1, + 1, + 1, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 6, + "INLINED": "31C3A931C3A9" + }, + { + "SIZE": 14, + "PREFIX": "32323232", + "BUFFER_INDEX": 0, + "OFFSET": 0 + }, + { + "SIZE": 14, + "PREFIX": "33333333", + "BUFFER_INDEX": 0, + "OFFSET": 14 + }, + { + "SIZE": 2, + "INLINED": "3434" + }, + { + "SIZE": 2, + "INLINED": "3535" + } + ], + "VARIADIC_BUFFERS": [ + 
"32323232323232323232323232323333333333333333333333333333" + ] + }, + { + "name": "string_view", + "count": 5, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 6, + "INLINED": "1é1é" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 2, + "INLINED": "44" + }, + { + "SIZE": 2, + "INLINED": "55" + } + ], + "VARIADIC_BUFFERS": [""] + } + ] + } + ] +}` +} diff --git a/go/arrow/internal/flatbuf/MetadataVersion.go b/go/arrow/internal/flatbuf/MetadataVersion.go index 21b234f9c2b21..bb5e99dd588ad 100644 --- a/go/arrow/internal/flatbuf/MetadataVersion.go +++ b/go/arrow/internal/flatbuf/MetadataVersion.go @@ -31,7 +31,7 @@ const ( MetadataVersionV3 MetadataVersion = 2 /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. MetadataVersionV4 MetadataVersion = 3 - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. 
/// diff --git a/go/arrow/internal/testing/gen/random_array_gen.go b/go/arrow/internal/testing/gen/random_array_gen.go index b42273ff93fac..57b417bd2b878 100644 --- a/go/arrow/internal/testing/gen/random_array_gen.go +++ b/go/arrow/internal/testing/gen/random_array_gen.go @@ -351,6 +351,40 @@ func (r *RandomArrayGenerator) LargeString(size int64, minLength, maxLength int6 return bldr.NewArray() } +func (r *RandomArrayGenerator) StringView(size int64, minLength, maxLength int64, nullProb float64) arrow.Array { + return r.generateBinaryView(arrow.BinaryTypes.StringView, size, minLength, maxLength, nullProb) +} + +func (r *RandomArrayGenerator) generateBinaryView(dt arrow.DataType, size int64, minLength, maxLength int64, nullProb float64) arrow.Array { + lengths := r.Int32(size, int32(minLength), int32(maxLength), nullProb).(*array.Int32) + defer lengths.Release() + + bldr := array.NewBuilder(r.mem, dt).(array.StringLikeBuilder) + defer bldr.Release() + + r.extra++ + dist := rand.New(rand.NewSource(r.seed + r.extra)) + + buf := make([]byte, 0, maxLength) + gen := func(n int32) string { + out := buf[:n] + for i := range out { + out[i] = uint8(dist.Int31n(int32('z')-int32('A')+1) + int32('A')) + } + return string(out) + } + + for i := 0; i < lengths.Len(); i++ { + if lengths.IsNull(i) { + bldr.AppendNull() + continue + } + bldr.Append(gen(lengths.Value(i))) + } + + return bldr.NewArray() +} + func (r *RandomArrayGenerator) Numeric(dt arrow.Type, size int64, min, max int64, nullprob float64) arrow.Array { switch dt { case arrow.INT8: diff --git a/go/arrow/ipc/endian_swap.go b/go/arrow/ipc/endian_swap.go index d2e0948434abc..35ba0e4e764f9 100644 --- a/go/arrow/ipc/endian_swap.go +++ b/go/arrow/ipc/endian_swap.go @@ -18,6 +18,7 @@ package ipc import ( "errors" + "fmt" "math/bits" "github.com/apache/arrow/go/v15/arrow" @@ -119,7 +120,10 @@ func swapType(dt arrow.DataType, data *array.Data) (err error) { return swapType(dt.IndexType, data) case arrow.FixedWidthDataType: 
byteSwapBuffer(dt.BitWidth(), data.Buffers()[1]) + default: + err = fmt.Errorf("%w: swapping endianness of %s", arrow.ErrNotImplemented, dt) } + return } diff --git a/go/arrow/ipc/file_reader.go b/go/arrow/ipc/file_reader.go index 330355d3a60c3..1c7eb31799cfa 100644 --- a/go/arrow/ipc/file_reader.go +++ b/go/arrow/ipc/file_reader.go @@ -430,13 +430,18 @@ func (src *ipcSource) fieldMetadata(i int) *flatbuf.FieldNode { return &node } +func (src *ipcSource) variadicCount(i int) int64 { + return src.meta.VariadicBufferCounts(i) +} + type arrayLoaderContext struct { - src ipcSource - ifield int - ibuffer int - max int - memo *dictutils.Memo - version MetadataVersion + src ipcSource + ifield int + ibuffer int + ivariadic int + max int + memo *dictutils.Memo + version MetadataVersion } func (ctx *arrayLoaderContext) field() *flatbuf.FieldNode { @@ -451,6 +456,12 @@ func (ctx *arrayLoaderContext) buffer() *memory.Buffer { return buf } +func (ctx *arrayLoaderContext) variadic() int64 { + v := ctx.src.variadicCount(ctx.ivariadic) + ctx.ivariadic++ + return v +} + func (ctx *arrayLoaderContext) loadArray(dt arrow.DataType) arrow.ArrayData { switch dt := dt.(type) { case *arrow.NullType: @@ -476,6 +487,9 @@ func (ctx *arrayLoaderContext) loadArray(dt arrow.DataType) arrow.ArrayData { case *arrow.BinaryType, *arrow.StringType, *arrow.LargeStringType, *arrow.LargeBinaryType: return ctx.loadBinary(dt) + case arrow.BinaryViewDataType: + return ctx.loadBinaryView(dt) + case *arrow.FixedSizeBinaryType: return ctx.loadFixedSizeBinary(dt) @@ -582,6 +596,18 @@ func (ctx *arrayLoaderContext) loadBinary(dt arrow.DataType) arrow.ArrayData { return array.NewData(dt, int(field.Length()), buffers, nil, int(field.NullCount()), 0) } +func (ctx *arrayLoaderContext) loadBinaryView(dt arrow.DataType) arrow.ArrayData { + nVariadicBufs := ctx.variadic() + field, buffers := ctx.loadCommon(dt.ID(), 2+int(nVariadicBufs)) + buffers = append(buffers, ctx.buffer()) + for i := 0; i < int(nVariadicBufs); 
i++ { + buffers = append(buffers, ctx.buffer()) + } + defer releaseBuffers(buffers) + + return array.NewData(dt, int(field.Length()), buffers, nil, int(field.NullCount()), 0) +} + func (ctx *arrayLoaderContext) loadFixedSizeBinary(dt *arrow.FixedSizeBinaryType) arrow.ArrayData { field, buffers := ctx.loadCommon(dt.ID(), 2) buffers = append(buffers, ctx.buffer()) diff --git a/go/arrow/ipc/message.go b/go/arrow/ipc/message.go index 709aa5aa2dba4..5295c5df30137 100644 --- a/go/arrow/ipc/message.go +++ b/go/arrow/ipc/message.go @@ -31,11 +31,11 @@ import ( type MetadataVersion flatbuf.MetadataVersion const ( - MetadataV1 = MetadataVersion(flatbuf.MetadataVersionV1) // version for Arrow-0.1.0 - MetadataV2 = MetadataVersion(flatbuf.MetadataVersionV2) // version for Arrow-0.2.0 - MetadataV3 = MetadataVersion(flatbuf.MetadataVersionV3) // version for Arrow-0.3.0 to 0.7.1 - MetadataV4 = MetadataVersion(flatbuf.MetadataVersionV4) // version for >= Arrow-0.8.0 - MetadataV5 = MetadataVersion(flatbuf.MetadataVersionV5) // version for >= Arrow-1.0.0, backward compatible with v4 + MetadataV1 = MetadataVersion(flatbuf.MetadataVersionV1) // version for Arrow Format-0.1.0 + MetadataV2 = MetadataVersion(flatbuf.MetadataVersionV2) // version for Arrow Format-0.2.0 + MetadataV3 = MetadataVersion(flatbuf.MetadataVersionV3) // version for Arrow Format-0.3.0 to 0.7.1 + MetadataV4 = MetadataVersion(flatbuf.MetadataVersionV4) // version for >= Arrow Format-0.8.0 + MetadataV5 = MetadataVersion(flatbuf.MetadataVersionV5) // version for >= Arrow Format-1.0.0, backward compatible with v4 ) func (m MetadataVersion) String() string { diff --git a/go/arrow/ipc/metadata.go b/go/arrow/ipc/metadata.go index bd437834c3d06..54ef58753a173 100644 --- a/go/arrow/ipc/metadata.go +++ b/go/arrow/ipc/metadata.go @@ -323,6 +323,16 @@ func (fv *fieldVisitor) visit(field arrow.Field) { flatbuf.LargeUtf8Start(fv.b) fv.offset = flatbuf.LargeUtf8End(fv.b) + case *arrow.BinaryViewType: + fv.dtype = 
flatbuf.TypeBinaryView + flatbuf.BinaryViewStart(fv.b) + fv.offset = flatbuf.BinaryViewEnd(fv.b) + + case *arrow.StringViewType: + fv.dtype = flatbuf.TypeUtf8View + flatbuf.Utf8ViewStart(fv.b) + fv.offset = flatbuf.Utf8ViewEnd(fv.b) + case *arrow.Date32Type: fv.dtype = flatbuf.TypeDate flatbuf.DateStart(fv.b) @@ -713,6 +723,12 @@ func concreteTypeFromFB(typ flatbuf.Type, data flatbuffers.Table, children []arr case flatbuf.TypeLargeUtf8: return arrow.BinaryTypes.LargeString, nil + case flatbuf.TypeUtf8View: + return arrow.BinaryTypes.StringView, nil + + case flatbuf.TypeBinaryView: + return arrow.BinaryTypes.BinaryView, nil + case flatbuf.TypeBool: return arrow.FixedWidthTypes.Boolean, nil @@ -1168,15 +1184,15 @@ func writeFileFooter(schema *arrow.Schema, dicts, recs []fileBlock, w io.Writer) return err } -func writeRecordMessage(mem memory.Allocator, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) *memory.Buffer { +func writeRecordMessage(mem memory.Allocator, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) *memory.Buffer { b := flatbuffers.NewBuilder(0) - recFB := recordToFB(b, size, bodyLength, fields, meta, codec) + recFB := recordToFB(b, size, bodyLength, fields, meta, codec, variadicCounts) return writeMessageFB(b, mem, flatbuf.MessageHeaderRecordBatch, recFB, bodyLength) } -func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) *memory.Buffer { +func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) *memory.Buffer { b := flatbuffers.NewBuilder(0) - recFB := recordToFB(b, size, bodyLength, fields, meta, codec) + recFB := recordToFB(b, size, bodyLength, fields, meta, 
codec, variadicCounts) flatbuf.DictionaryBatchStart(b) flatbuf.DictionaryBatchAddId(b, id) @@ -1186,7 +1202,7 @@ func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size, return writeMessageFB(b, mem, flatbuf.MessageHeaderDictionaryBatch, dictFB, bodyLength) } -func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) flatbuffers.UOffsetT { +func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) flatbuffers.UOffsetT { fieldsFB := writeFieldNodes(b, fields, flatbuf.RecordBatchStartNodesVector) metaFB := writeBuffers(b, meta, flatbuf.RecordBatchStartBuffersVector) var bodyCompressFB flatbuffers.UOffsetT @@ -1194,10 +1210,24 @@ func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMe bodyCompressFB = writeBodyCompression(b, codec) } + var vcFB *flatbuffers.UOffsetT + if len(variadicCounts) > 0 { + flatbuf.RecordBatchStartVariadicBufferCountsVector(b, len(variadicCounts)) + for i := len(variadicCounts) - 1; i >= 0; i-- { + b.PrependInt64(variadicCounts[i]) + } + vcFBVal := b.EndVector(len(variadicCounts)) + vcFB = &vcFBVal + } + flatbuf.RecordBatchStart(b) flatbuf.RecordBatchAddLength(b, size) flatbuf.RecordBatchAddNodes(b, fieldsFB) flatbuf.RecordBatchAddBuffers(b, metaFB) + if vcFB != nil { + flatbuf.RecordBatchAddVariadicBufferCounts(b, *vcFB) + } + if codec != -1 { flatbuf.RecordBatchAddCompression(b, bodyCompressFB) } diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index 58c56d2d16ccf..e9d59f0e35e00 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -277,7 +277,7 @@ type dictEncoder struct { } func (d *dictEncoder) encodeMetadata(p *Payload, isDelta bool, id, nrows int64) error { - p.meta = writeDictionaryMessage(d.mem, id, isDelta, nrows, p.size, d.fields, d.meta, d.codec) + p.meta = 
writeDictionaryMessage(d.mem, id, isDelta, nrows, p.size, d.fields, d.meta, d.codec, d.variadicCounts) return nil } @@ -300,8 +300,9 @@ func (d *dictEncoder) Encode(p *Payload, id int64, isDelta bool, dict arrow.Arra type recordEncoder struct { mem memory.Allocator - fields []fieldMetadata - meta []bufferMetadata + fields []fieldMetadata + meta []bufferMetadata + variadicCounts []int64 depth int64 start int64 @@ -602,6 +603,33 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { p.body = append(p.body, voffsets) p.body = append(p.body, values) + case arrow.BinaryViewDataType: + data := arr.Data() + values := data.Buffers()[1] + arrLen := int64(arr.Len()) + typeWidth := int64(arrow.ViewHeaderSizeBytes) + minLength := paddedLength(arrLen*typeWidth, kArrowAlignment) + + switch { + case needTruncate(int64(data.Offset()), values, minLength): + // non-zero offset: slice the buffer + offset := data.Offset() * int(typeWidth) + // send padding if available + len := int(minI64(bitutil.CeilByte64(arrLen*typeWidth), int64(values.Len()-offset))) + values = memory.SliceBuffer(values, offset, len) + default: + if values != nil { + values.Retain() + } + } + p.body = append(p.body, values) + + w.variadicCounts = append(w.variadicCounts, int64(len(data.Buffers())-2)) + for _, b := range data.Buffers()[2:] { + b.Retain() + p.body = append(p.body, b) + } + case *arrow.StructType: w.depth-- arr := arr.(*array.Struct) @@ -946,7 +974,7 @@ func (w *recordEncoder) Encode(p *Payload, rec arrow.Record) error { } func (w *recordEncoder) encodeMetadata(p *Payload, nrows int64) error { - p.meta = writeRecordMessage(w.mem, nrows, p.size, w.fields, w.meta, w.codec) + p.meta = writeRecordMessage(w.mem, nrows, p.size, w.fields, w.meta, w.codec, w.variadicCounts) return nil } diff --git a/go/arrow/type_traits_view.go b/go/arrow/type_traits_view.go new file mode 100644 index 0000000000000..c3846db294681 --- /dev/null +++ b/go/arrow/type_traits_view.go @@ -0,0 +1,53 @@ +// Licensed to 
the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arrow + +import ( + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v15/arrow/endian" +) + +var ViewHeaderTraits viewHeaderTraits + +const ( + ViewHeaderSizeBytes = int(unsafe.Sizeof(ViewHeader{})) +) + +type viewHeaderTraits struct{} + +func (viewHeaderTraits) BytesRequired(n int) int { return ViewHeaderSizeBytes * n } + +func (viewHeaderTraits) PutValue(b []byte, v ViewHeader) { + endian.Native.PutUint32(b, uint32(v.size)) + copy(b[4:], v.data[:]) +} + +func (viewHeaderTraits) CastFromBytes(b []byte) (res []ViewHeader) { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + return unsafe.Slice((*ViewHeader)(unsafe.Pointer(h.Data)), cap(b)/ViewHeaderSizeBytes)[:len(b)/ViewHeaderSizeBytes] +} + +func (viewHeaderTraits) CastToBytes(b []ViewHeader) (res []byte) { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*ViewHeaderSizeBytes)[:len(b)*ViewHeaderSizeBytes] +} + +func (viewHeaderTraits) Copy(dst, src []ViewHeader) { copy(dst, src) } From a886fdaa2d80a2e7f56cf8a3cf94b367443b6e8e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 14 Nov 2023 22:19:48 -0400 Subject: [PATCH 07/23] GH-38715: [R] Fix 
possible bashism in configure script (#38716) ### Rationale for this change The CRAN incoming check for 14.0.0 is failing with a NOTE about a possible bashism ### What changes are included in this PR? One `test -a` usage was replaced with `&&`. ### Are these changes tested? Yes (via crossbow, below) ### Are there any user-facing changes? No * Closes: #38715 Authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- r/configure | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/configure b/r/configure index 5a7f0c81a947b..4f09cfdc4419b 100755 --- a/r/configure +++ b/r/configure @@ -126,7 +126,7 @@ fi # but doing it now allows us to catch it in # nixlibs.R and activate S3 and GCS support for the source build. -# macOS ships with libressl. openssl is installable with brew, but it is +# macOS ships with libressl. openssl is installable with brew, but it is # generally not linked. We can over-ride this and find # openssl by setting OPENSSL_ROOT_DIR (which cmake will pick up later in # the installation process). @@ -135,7 +135,7 @@ if [ "${OPENSSL_ROOT_DIR}" = "" ] && brew --prefix openssl >/dev/null 2>&1; then export PKG_CONFIG_PATH="${OPENSSL_ROOT_DIR}/lib/pkgconfig${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}}" fi # Look for openssl with pkg-config for non-brew sources(e.g. CRAN) and Linux -if [ "${OPENSSL_ROOT_DIR}" = "" -a "${PKG_CONFIG_AVAILABLE}" = "true" ]; then +if [ "${OPENSSL_ROOT_DIR}" = "" ] && [ "${PKG_CONFIG_AVAILABLE}" = "true" ]; then if ${PKG_CONFIG} --exists openssl; then export OPENSSL_ROOT_DIR="`${PKG_CONFIG} --variable=prefix openssl`" fi @@ -282,7 +282,7 @@ set_pkg_vars () { PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS" fi - # We use expr because the product version returns more than just 10.13 and we want to + # We use expr because the product version returns more than just 10.13 and we want to # match the substring. 
However, expr always outputs the number of matched characters # to stdout, to avoid noise in the log we redirect the output to /dev/null if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then From e49d8ae15583ceff03237571569099a6ad62be32 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Wed, 15 Nov 2023 14:17:42 +0900 Subject: [PATCH 08/23] GH-38711: [CI] Rollback aws-cli for preview documentation (#38723) ### Rationale for this change Restored the Runner image to not delete `aws-cli` for the execution of the `preview-docs` command. ### Are these changes tested? No ### Are there any user-facing changes? No * Closes: #38711 Authored-by: Hyunseok Seo Signed-off-by: Sutou Kouhei --- ci/scripts/util_free_space.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/ci/scripts/util_free_space.sh b/ci/scripts/util_free_space.sh index 0518869d06993..dd6ba2c4600a9 100755 --- a/ci/scripts/util_free_space.sh +++ b/ci/scripts/util_free_space.sh @@ -25,7 +25,6 @@ du -hsc /usr/local/* echo "::endgroup::" # ~1GB sudo rm -rf \ - /usr/local/aws-cli \ /usr/local/aws-sam-cil \ /usr/local/julia* || : echo "::group::/usr/local/bin/*" @@ -34,8 +33,6 @@ echo "::endgroup::" # ~1GB (From 1.2GB to 214MB) sudo rm -rf \ /usr/local/bin/aliyun \ - /usr/local/bin/aws \ - /usr/local/bin/aws_completer \ /usr/local/bin/azcopy \ /usr/local/bin/bicep \ /usr/local/bin/cmake-gui \ From 5b17b8402e0444f1a9b9ce1fb4dc2b7b92e9aede Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 15 Nov 2023 09:21:22 +0100 Subject: [PATCH 09/23] GH-38712: [Python] Remove dead code in _reconstruct_block (#38714) ### Rationale for this change It seems the object case in `_reconstruct_block` is a dead code and is not needed anymore so therefore could be removed. ### What changes are included in this PR? Removal of the object case in `_reconstruct_block` code. Was also looking at the `arrow_to_pandas.cc` code to see if there is any dead code present and I couldn't find any. 
### Are these changes tested? The change in this PR should not make any of the existing tests fail. ### Are there any user-facing changes? There shouldn't be. * Closes: #38712 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas_compat.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index e232603ba45ac..be29f68a13d5f 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -26,7 +26,6 @@ from itertools import zip_longest import json import operator -import pickle import re import warnings @@ -721,9 +720,6 @@ def _reconstruct_block(item, columns=None, extension_columns=None): block = _int.make_block(block_arr, placement=placement, klass=_int.DatetimeTZBlock, dtype=dtype) - elif 'object' in item: - block = _int.make_block(pickle.loads(block_arr), - placement=placement) elif 'py_array' in item: # create ExtensionBlock arr = item['py_array'] From cc627ee7e35807a98717603d0a2520685919e17c Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Wed, 15 Nov 2023 19:01:43 +0900 Subject: [PATCH 10/23] GH-38599: [Docs] Update Headers (#38696) ### Rationale for this change Noticed wrong section headings on the web doc and proposing a fix. ### Are these changes tested? Yes. Built and verified the documentation locally. image ### Are there any user-facing changes? No. * Closes: #38599 Authored-by: Hyunseok Seo Signed-off-by: Antoine Pitrou --- docs/source/format/CDeviceDataInterface.rst | 29 ++++++++++----------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst index b54e6eabe0b2d..a584852df87eb 100644 --- a/docs/source/format/CDeviceDataInterface.rst +++ b/docs/source/format/CDeviceDataInterface.rst @@ -277,7 +277,7 @@ has the following fields: to access the memory in the buffers. 
If an event is provided, then the producer MUST ensure that the exported - data is available on the device before the event is triggered. The + data is available on the device before the event is triggered. The consumer SHOULD wait on the event before trying to access the exported data. @@ -290,7 +290,7 @@ has the following fields: As non-CPU development expands, there may be a need to expand this structure. In order to do so without potentially breaking ABI changes, we reserve 24 bytes at the end of the object. These bytes MUST be zero'd - out after initialization by the producer in order to ensure safe + out after initialization by the producer in order to ensure safe evolution of the ABI in the future. .. _c-device-data-interface-event-types: @@ -300,7 +300,7 @@ Synchronization event types The table below lists the expected event types for each device type. If no event type is supported ("N/A"), then the ``sync_event`` member -should always be null. +should always be null. Remember that the event *CAN* be null if synchronization is not needed to access the data. @@ -352,7 +352,7 @@ Memory management ----------------- First and foremost: Out of everything in this interface, it is *only* the -data buffers themselves which reside in device memory (i.e. the ``buffers`` +data buffers themselves which reside in device memory (i.e. the ``buffers`` member of the ``ArrowArray`` struct). Everything else should be in CPU memory. @@ -408,7 +408,7 @@ see inconsistent data while the other is mutating it. Synchronization --------------- -If the ``sync_event`` member is non-NULL, the consumer should not attempt +If the ``sync_event`` member is non-NULL, the consumer should not attempt to access or read the data until they have synchronized on that event. If the ``sync_event`` member is NULL, then it MUST be safe to access the data without any synchronization necessary on the part of the consumer. 
@@ -501,7 +501,6 @@ could be used for any device: arr->array.release(&arr->array); } -======================= Device Stream Interface ======================= @@ -510,7 +509,7 @@ interface also specifies a higher-level structure for easing communication of streaming data within a single process. Semantics -========= +--------- An Arrow C device stream exposes a streaming source of data chunks, each with the same schema. Chunks are obtained by calling a blocking pull-style iteration @@ -520,7 +519,7 @@ to provide a stream of data on multiple device types, a producer should provide a separate stream object for each device type. Structure definition -==================== +-------------------- The C device stream interface is defined by a single ``struct`` definition: @@ -554,7 +553,7 @@ The C device stream interface is defined by a single ``struct`` definition: kept exactly as-is when these definitions are copied. The ArrowDeviceArrayStream structure ------------------------------------- +'''''''''''''''''''''''''''''''''''' The ``ArrowDeviceArrayStream`` provides a device type that can access the resulting data along with the required callbacks to interact with a @@ -627,20 +626,20 @@ streaming source of Arrow arrays. It has the following fields: handled by the producer, and especially by the release callback. Result lifetimes ----------------- +'''''''''''''''' The data returned by the ``get_schema`` and ``get_next`` callbacks must be released independantly. Their lifetimes are not tied to that of ``ArrowDeviceArrayStream``. Stream lifetime ---------------- +''''''''''''''' Lifetime of the C stream is managed using a release callback with similar usage as in :ref:`C data interface `. Thread safety -------------- +''''''''''''' The stream source is not assumed to be thread-safe. 
Consumers wanting to call ``get_next`` from several threads should ensure those calls are @@ -652,9 +651,9 @@ Interoperability with other interchange formats Other interchange APIs, such as the `CUDA Array Interface`_, include members to pass the shape and the data types of the data buffers being exported. This information is necessary to interpret the raw bytes in the -device data buffers that are being shared. Rather than store the -shape / types of the data alongside the ``ArrowDeviceArray``, users -should utilize the existing ``ArrowSchema`` structure to pass any data +device data buffers that are being shared. Rather than store the +shape / types of the data alongside the ``ArrowDeviceArray``, users +should utilize the existing ``ArrowSchema`` structure to pass any data type and shape information. Updating this specification From b55d13c16eb25f3264645e53cc03aa1f7d753b25 Mon Sep 17 00:00:00 2001 From: Ben Harkins <60872452+benibus@users.noreply.github.com> Date: Wed, 15 Nov 2023 06:05:07 -0500 Subject: [PATCH 11/23] GH-36036: [C++][Python][Parquet] Implement Float16 logical type (#36073) ### Rationale for this change There is currently an active proposal to support half-float types in Parquet. For more details/discussion, see the links in this PR's accompanying issue. ### What changes are included in this PR? This PR implements basic support for a `Float16LogicalType` in accordance with the proposed spec. More specifically, this includes: - Changes to `parquet.thrift` and regenerated `parqet_types` files - Basic `LogicalType` class definition, method impls, and enums - Support for specialized comparisons and column statistics In the interest of scope, this PR does not currently deal with arrow integration and byte split encoding - although we will want both of these features resolved before the proposal is approved. ### Are these changes tested? Yes (tests are included) ### Are there any user-facing changes? 
Yes * Closes: #36036 Lead-authored-by: benibus Co-authored-by: Ben Harkins <60872452+benibus@users.noreply.github.com> Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/float16.cc | 226 ++ cpp/src/arrow/util/float16.h | 209 ++ cpp/src/arrow/util/float16_test.cc | 367 +++ cpp/src/generated/parquet_types.cpp | 2267 +++++++++-------- cpp/src/generated/parquet_types.h | 45 +- .../parquet/arrow/arrow_reader_writer_test.cc | 33 +- cpp/src/parquet/arrow/arrow_schema_test.cc | 7 +- cpp/src/parquet/arrow/reader_internal.cc | 25 + cpp/src/parquet/arrow/schema.cc | 5 + cpp/src/parquet/arrow/schema_internal.cc | 2 + cpp/src/parquet/arrow/test_util.h | 21 +- cpp/src/parquet/column_writer.cc | 30 + cpp/src/parquet/page_index_test.cc | 22 + cpp/src/parquet/parquet.thrift | 2 + cpp/src/parquet/schema_test.cc | 29 +- cpp/src/parquet/statistics.cc | 179 +- cpp/src/parquet/statistics_test.cc | 355 ++- cpp/src/parquet/test_util.cc | 10 + cpp/src/parquet/test_util.h | 4 + cpp/src/parquet/types.cc | 26 + cpp/src/parquet/types.h | 13 + docs/source/cpp/parquet.rst | 2 + 24 files changed, 2670 insertions(+), 1211 deletions(-) create mode 100644 cpp/src/arrow/util/float16.cc create mode 100644 cpp/src/arrow/util/float16.h create mode 100644 cpp/src/arrow/util/float16_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 101b089ba837f..24e8eefad1523 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -223,6 +223,7 @@ set(ARROW_SRCS util/debug.cc util/decimal.cc util/delimiting.cc + util/float16.cc util/formatting.cc util/future.cc util/hashing.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 3dc8eac1abf64..2e9487dcf50c8 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -48,6 +48,7 @@ add_arrow_test(utility-test checked_cast_test.cc 
compression_test.cc decimal_test.cc + float16_test.cc formatting_util_test.cc key_value_metadata_test.cc hashing_test.cc diff --git a/cpp/src/arrow/util/float16.cc b/cpp/src/arrow/util/float16.cc new file mode 100644 index 0000000000000..5c8b3d10ca0cd --- /dev/null +++ b/cpp/src/arrow/util/float16.cc @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "arrow/util/float16.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace util { + +namespace { + +// -------------------------------------------------------- +// Binary conversions +// -------------------------------------------------------- +// These routines are partially adapted from Numpy's C implementation +// +// Some useful metrics for conversions between different precisions: +// |-----------------------------------------| +// | precision | half | single | double | +// |-----------------------------------------| +// | mantissa | 10 bits | 23 bits | 52 bits | +// | exponent | 5 bits | 8 bits | 11 bits | +// | sign | 1 bit | 1 bit | 1 bit | +// | exp bias | 15 | 127 | 1023 | +// |-----------------------------------------| + +template +struct BinaryConverter { + static_assert(std::is_same_v || std::is_same_v); + + static constexpr int kNumBits = sizeof(T) * 8; + static constexpr int kMantNumBits = (kNumBits == 32) ? 23 : 52; + static constexpr int kExpNumBits = kNumBits - kMantNumBits - 1; + + static constexpr int kExpBias = (1 << (kExpNumBits - 1)) - 1; + + static constexpr T kMantMask = (T(1) << kMantNumBits) - 1; + static constexpr T kExpMask = ((T(1) << kExpNumBits) - 1) << kMantNumBits; + static constexpr T kSignMask = T(1) << (kNumBits - 1); + + static_assert(kMantNumBits + kExpNumBits + 1 == kNumBits); + static_assert(kSignMask + kExpMask + kMantMask == ~T(0)); + + static uint16_t ToBinary16(T); + static T FromBinary16(uint16_t); +}; + +// Converts an IEEE binary32/64 into a binary16. Rounds to nearest with ties to even +template +uint16_t BinaryConverter::ToBinary16(T f_bits) { + // Sign mask for output binary16 + const uint16_t h_sign = uint16_t((f_bits >> (kNumBits - 16)) & 0x8000); + + // Exponent mask for input binary + const T f_exp = f_bits & kExpMask; + // Exponents as signed pre-shifted values for convenience. Here, we need to re-bias the + // exponent for a binary16.
If, after re-biasing, the binary16 exponent falls outside of + // the range [1,30] then we need to handle the under/overflow case specially. + const int16_t f_biased_exp = int16_t(f_exp >> kMantNumBits); + const int16_t unbiased_exp = f_biased_exp - kExpBias; + const int16_t h_biased_exp = unbiased_exp + 15; + + // Mantissa mask for input + const T f_mant = f_bits & kMantMask; + + // We define a "rounding bit", which is the most significant bit to be dropped + // (e.g. for a binary32, 0x1000). + constexpr T rounding_bit = T(1) << (kMantNumBits - (10 + 1)); + + // Handle exponent overflow, NaN, and +/-Inf + if (h_biased_exp >= 0x1f) { + // The input is a NaN representation + if (f_exp == kExpMask && f_mant != 0) { + uint16_t h_mant = uint16_t(f_mant >> (kMantNumBits - 10)); + // If the mantissa bit(s) indicating NaN were shifted out, add one back. Otherwise, + // the result would be infinity. + if (h_mant == 0) { + h_mant = 0x1; + } + return uint16_t(h_sign | 0x7c00u | h_mant); + } + + // Clamp to +/-infinity + return uint16_t(h_sign | 0x7c00u); + } + + // Handle exponent underflow, subnormals, and +/-0 + if (h_biased_exp <= 0) { + // If the underflow exceeds the number of bits in a binary16 mantissa (10) then we + // can't round, so just clamp to 0. Note that this also weeds out any input values + // that are subnormal - including +/-0; + if (h_biased_exp < -10) { + return h_sign; + } + + // Convert to a rounded subnormal value starting with the mantissa. Since the + // input is known to be normal at this point, we need to prepend its implicit leading + // bit - which also necessitates an additional right-shift.
+ T rounded_mant = (T(1) << kMantNumBits) | f_mant; + rounded_mant >>= (1 - h_biased_exp); + + // Here, we implement rounding to nearest (with ties to even) + // + // By now, our new mantissa has two conceptual ranges: + // - The lower 13 bits, which will be shifted out + // - The upper 10 bits, which will become the binary16's mantissa + // + // "Rounding to nearest" basically just means that we add 1 to the rounding bit. If + // it's set, then the bit will cascade upwards into the 10-bit mantissa (and + // potentially the exponent). The only time where we may NOT do this is when a "tie" + // occurs - i.e. when the rounding bit is set but all of the lower bits are 0. In that + // case, we don't add 1 if the retained mantissa is "even" (its least significant bit + // is 0). + if ((rounded_mant & ((rounding_bit << 2) - 1)) != rounding_bit || + (f_mant & 0x7ffu) != 0) { + rounded_mant += rounding_bit; + } + + const uint16_t h_mant = uint16_t(rounded_mant >> (kMantNumBits - 10)); + return h_sign + h_mant; + } + + const uint16_t h_exp = uint16_t(h_biased_exp) << 10; + + // See comment on rounding behavior above + T rounded_mant = f_mant; + if ((rounded_mant & ((rounding_bit << 2) - 1)) != rounding_bit) { + rounded_mant += rounding_bit; + } + + const uint16_t h_mant = uint16_t(rounded_mant >> (kMantNumBits - 10)); + // Note that we ADD (rather than OR) the components because we want the carryover bit + // from rounding the mantissa to cascade through the exponent (it shouldn't affect the + // sign bit though). 
+ return h_sign + h_exp + h_mant; +} + +// Converts a IEEE binary16 into a binary32/64 +template +T BinaryConverter::FromBinary16(uint16_t h_bits) { + // Sign mask for output + const T f_sign = T(h_bits & 0x8000u) << (kNumBits - 16); + + // Exponent mask for input binary16 + const uint16_t h_exp = h_bits & 0x7c00; + // Mantissa mask for input binary16 + const uint16_t h_mant = h_bits & 0x3ffu; + + switch (h_exp) { + // Handle Inf and NaN + case 0x7c00u: + return f_sign | kExpMask | (T(h_mant) << (kMantNumBits - 10)); + // Handle zeros and subnormals + case 0x0000u: { + // Input is +/-0 + if (h_mant == 0) { + return f_sign; + } + // Subnormal binary16 to normal binary32/64 + // + // Start with an f32/64-biased exponent of 2^-15. We then decrement it until the + // most significant set bit is left-shifted out - as it doesn't get explicitly + // stored in normalized floating point values. Instead, its existence is implied by + // the new exponent. + T f_exp = kExpBias - 15; + T f_mant = T(h_mant) << 1; + while ((f_mant & 0x0400u) == 0) { + --f_exp; + f_mant <<= 1; + } + f_exp <<= kMantNumBits; + f_mant = (f_mant & 0x03ffu) << (kMantNumBits - 10); + return f_sign | f_exp | f_mant; + } break; + // Handle normals + default: + // Equivalent to rebiasing the exponent and shifting everything by the remaining + // mantissa bits. 
+ return f_sign | + ((T(h_bits & 0x7fffu) + (T(kExpBias - 15) << 10)) << (kMantNumBits - 10)); + } +} + +} // namespace + +float Float16::ToFloat() const { + const uint32_t f_bits = BinaryConverter::FromBinary16(bits_); + return SafeCopy(f_bits); +} + +Float16 Float16::FromFloat(float f) { + const uint32_t f_bits = SafeCopy(f); + return FromBits(BinaryConverter::ToBinary16(f_bits)); +} + +double Float16::ToDouble() const { + const uint64_t d_bits = BinaryConverter::FromBinary16(bits_); + return SafeCopy(d_bits); +} + +Float16 Float16::FromDouble(double d) { + const uint64_t d_bits = SafeCopy(d); + return FromBits(BinaryConverter::ToBinary16(d_bits)); +} + +std::ostream& operator<<(std::ostream& os, Float16 arg) { return (os << arg.ToFloat()); } + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/util/float16.h b/cpp/src/arrow/util/float16.h new file mode 100644 index 0000000000000..0a432fee2cd31 --- /dev/null +++ b/cpp/src/arrow/util/float16.h @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/util/endian.h" +#include "arrow/util/macros.h" +#include "arrow/util/ubsan.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +/// \brief Class representing an IEEE half-precision float, encoded as a `uint16_t` +/// +/// The exact format is as follows (from LSB to MSB): +/// - bits 0-9: mantissa +/// - bits 10-14: exponent +/// - bit 15: sign +/// +class ARROW_EXPORT Float16 { + public: + Float16() = default; + explicit Float16(float f) : Float16(FromFloat(f)) {} + explicit Float16(double d) : Float16(FromDouble(d)) {} + template >* = NULLPTR> + explicit Float16(T v) : Float16(static_cast(v)) {} + + /// \brief Create a `Float16` from its exact binary representation + constexpr static Float16 FromBits(uint16_t bits) { return Float16{bits, bool{}}; } + /// \brief Create a `Float16` from a 32-bit float (may lose precision) + static Float16 FromFloat(float f); + /// \brief Create a `Float16` from a 64-bit float (may lose precision) + static Float16 FromDouble(double d); + + /// \brief Read a `Float16` from memory in native-endian byte order + static Float16 FromBytes(const uint8_t* src) { + return FromBits(SafeLoadAs(src)); + } + + /// \brief Read a `Float16` from memory in little-endian byte order + static Float16 FromLittleEndian(const uint8_t* src) { + return FromBits(::arrow::bit_util::FromLittleEndian(SafeLoadAs(src))); + } + + /// \brief Read a `Float16` from memory in big-endian byte order + static Float16 FromBigEndian(const uint8_t* src) { + return FromBits(::arrow::bit_util::FromBigEndian(SafeLoadAs(src))); + } + + /// \brief Return the value's binary representation as a `uint16_t` + constexpr uint16_t bits() const { return bits_; } + + /// \brief Return true if the value is negative (sign bit is set) + constexpr bool signbit() const { return (bits_ & 0x8000) != 0; } + + /// \brief Return true if the value is NaN + constexpr bool
is_nan() const { return (bits_ & 0x7fff) > 0x7c00; } + /// \brief Return true if the value is positive/negative infinity + constexpr bool is_infinity() const { return (bits_ & 0x7fff) == 0x7c00; } + /// \brief Return true if the value is finite and not NaN + constexpr bool is_finite() const { return (bits_ & 0x7c00) != 0x7c00; } + /// \brief Return true if the value is positive/negative zero + constexpr bool is_zero() const { return (bits_ & 0x7fff) == 0; } + + /// \brief Convert to a 32-bit float + float ToFloat() const; + /// \brief Convert to a 64-bit float + double ToDouble() const; + + explicit operator float() const { return ToFloat(); } + explicit operator double() const { return ToDouble(); } + + /// \brief Copy the value's bytes in native-endian byte order + void ToBytes(uint8_t* dest) const { std::memcpy(dest, &bits_, sizeof(bits_)); } + /// \brief Return the value's bytes in native-endian byte order + constexpr std::array ToBytes() const { +#if ARROW_LITTLE_ENDIAN + return ToLittleEndian(); +#else + return ToBigEndian(); +#endif + } + + /// \brief Copy the value's bytes in little-endian byte order + void ToLittleEndian(uint8_t* dest) const { + const auto bytes = ToLittleEndian(); + std::memcpy(dest, bytes.data(), bytes.size()); + } + /// \brief Return the value's bytes in little-endian byte order + constexpr std::array ToLittleEndian() const { +#if ARROW_LITTLE_ENDIAN + return {uint8_t(bits_ & 0xff), uint8_t(bits_ >> 8)}; +#else + return {uint8_t(bits_ >> 8), uint8_t(bits_ & 0xff)}; +#endif + } + + /// \brief Copy the value's bytes in big-endian byte order + void ToBigEndian(uint8_t* dest) const { + const auto bytes = ToBigEndian(); + std::memcpy(dest, bytes.data(), bytes.size()); + } + /// \brief Return the value's bytes in big-endian byte order + constexpr std::array ToBigEndian() const { +#if ARROW_LITTLE_ENDIAN + return {uint8_t(bits_ >> 8), uint8_t(bits_ & 0xff)}; +#else + return {uint8_t(bits_ & 0xff), uint8_t(bits_ >> 8)}; +#endif + } + + 
constexpr Float16 operator-() const { return FromBits(bits_ ^ 0x8000); } + constexpr Float16 operator+() const { return FromBits(bits_); } + + friend constexpr bool operator==(Float16 lhs, Float16 rhs) { + if (lhs.is_nan() || rhs.is_nan()) return false; + return Float16::CompareEq(lhs, rhs); + } + friend constexpr bool operator!=(Float16 lhs, Float16 rhs) { return !(lhs == rhs); } + + friend constexpr bool operator<(Float16 lhs, Float16 rhs) { + if (lhs.is_nan() || rhs.is_nan()) return false; + return Float16::CompareLt(lhs, rhs); + } + friend constexpr bool operator>(Float16 lhs, Float16 rhs) { return rhs < lhs; } + + friend constexpr bool operator<=(Float16 lhs, Float16 rhs) { + if (lhs.is_nan() || rhs.is_nan()) return false; + return !Float16::CompareLt(rhs, lhs); + } + friend constexpr bool operator>=(Float16 lhs, Float16 rhs) { return rhs <= lhs; } + + ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, Float16 arg); + + protected: + uint16_t bits_; + + private: + constexpr Float16(uint16_t bits, bool) : bits_(bits) {} + + // Comparison helpers that assume neither operand is NaN + static constexpr bool CompareEq(Float16 lhs, Float16 rhs) { + return (lhs.bits() == rhs.bits()) || (lhs.is_zero() && rhs.is_zero()); + } + static constexpr bool CompareLt(Float16 lhs, Float16 rhs) { + if (lhs.signbit()) { + if (rhs.signbit()) { + // Both are negative + return lhs.bits() > rhs.bits(); + } else { + // Handle +/-0 + return !lhs.is_zero() || rhs.bits() != 0; + } + } else if (rhs.signbit()) { + return false; + } else { + // Both are positive + return lhs.bits() < rhs.bits(); + } + } +}; + +static_assert(std::is_trivial_v); + +} // namespace util +} // namespace arrow + +// TODO: Not complete +template <> +class std::numeric_limits { + using T = arrow::util::Float16; + + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; 
+ + static constexpr T min() { return T::FromBits(0b0000010000000000); } + static constexpr T max() { return T::FromBits(0b0111101111111111); } + static constexpr T lowest() { return -max(); } + + static constexpr T infinity() { return T::FromBits(0b0111110000000000); } + + static constexpr T quiet_NaN() { return T::FromBits(0b0111111111111111); } +}; diff --git a/cpp/src/arrow/util/float16_test.cc b/cpp/src/arrow/util/float16_test.cc new file mode 100644 index 0000000000000..073375882e3c2 --- /dev/null +++ b/cpp/src/arrow/util/float16_test.cc @@ -0,0 +1,367 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include + +#include "arrow/testing/gtest_util.h" +#include "arrow/util/endian.h" +#include "arrow/util/float16.h" +#include "arrow/util/span.h" +#include "arrow/util/ubsan.h" + +namespace arrow::util { +namespace { + +template +using Limits = std::numeric_limits; + +float F32(uint32_t bits) { return SafeCopy(bits); } +double F64(uint64_t bits) { return SafeCopy(bits); } + +template +class Float16ConversionTest : public ::testing::Test { + public: + struct RoundTripTestCase { + T input; + uint16_t bits; + T output; + }; + + static void TestRoundTrip(span test_cases) { + for (size_t index = 0; index < test_cases.size(); ++index) { + ARROW_SCOPED_TRACE("i=", index); + const auto& tc = test_cases[index]; + + const auto f16 = Float16(tc.input); + EXPECT_EQ(tc.bits, f16.bits()); + EXPECT_EQ(tc.output, static_cast(f16)); + + EXPECT_EQ(std::signbit(tc.output), f16.signbit()); + EXPECT_EQ(std::isnan(tc.output), f16.is_nan()); + EXPECT_EQ(std::isinf(tc.output), f16.is_infinity()); + EXPECT_EQ(std::isfinite(tc.output), f16.is_finite()); + } + } + + static void TestRoundTripFromNaN(span test_cases) { + for (size_t i = 0; i < test_cases.size(); ++i) { + ARROW_SCOPED_TRACE("i=", i); + const auto input = test_cases[i]; + + ASSERT_TRUE(std::isnan(input)); + const bool sign = std::signbit(input); + + const auto f16 = Float16(input); + EXPECT_TRUE(f16.is_nan()); + EXPECT_EQ(std::isinf(input), f16.is_infinity()); + EXPECT_EQ(std::isfinite(input), f16.is_finite()); + EXPECT_EQ(sign, f16.signbit()); + + const auto output = static_cast(f16); + EXPECT_TRUE(std::isnan(output)); + EXPECT_EQ(sign, std::signbit(output)); + } + } + + void TestRoundTripFromInf() { + const T test_cases[] = {+Limits::infinity(), -Limits::infinity()}; + + for (size_t i = 0; i < std::size(test_cases); ++i) { + ARROW_SCOPED_TRACE("i=", i); + const auto input = test_cases[i]; + + ASSERT_TRUE(std::isinf(input)); + const bool sign = std::signbit(input); + + const auto f16 = 
Float16(input); + EXPECT_TRUE(f16.is_infinity()); + EXPECT_EQ(std::isfinite(input), f16.is_finite()); + EXPECT_EQ(std::isnan(input), f16.is_nan()); + EXPECT_EQ(sign, f16.signbit()); + + const auto output = static_cast(f16); + EXPECT_TRUE(std::isinf(output)); + EXPECT_EQ(sign, std::signbit(output)); + } + } + + void TestRoundTrip(); + void TestRoundTripFromNaN(); +}; + +template <> +void Float16ConversionTest::TestRoundTrip() { + // Expected values were also manually validated with numpy-1.24.3 + const RoundTripTestCase test_cases[] = { + // +/-0.0f + {F32(0x80000000u), 0b1000000000000000u, -0.0f}, + {F32(0x00000000u), 0b0000000000000000u, +0.0f}, + // 32-bit exp is 102 => 2^-25. Rounding to nearest. + {F32(0xb3000001u), 0b1000000000000001u, -5.96046447754e-8f}, + // 32-bit exp is 102 => 2^-25. Rounding to even. + {F32(0xb3000000u), 0b1000000000000000u, -0.0f}, + // 32-bit exp is 101 => 2^-26. Underflow to zero. + {F32(0xb2800001u), 0b1000000000000000u, -0.0f}, + // 32-bit exp is 108 => 2^-19. + {F32(0xb61a0000u), 0b1000000000100110u, -2.26497650146e-6f}, + // 32-bit exp is 108 => 2^-19. + {F32(0xb61e0000u), 0b1000000000101000u, -2.38418579102e-6f}, + // 32-bit exp is 112 => 2^-15. Rounding to nearest. + {F32(0xb87fa001u), 0b1000001111111111u, -6.09755516052e-5f}, + // 32-bit exp is 112 => 2^-15. Rounds to 16-bit exp of 1 => 2^-14 + {F32(0xb87fe001u), 0b1000010000000000u, -6.103515625e-5f}, + // 32-bit exp is 142 => 2^15. Rounding to nearest. + {F32(0xc7001001u), 0b1111100000000001u, -32800.0f}, + // 32-bit exp is 142 => 2^15. Rounding to even. + {F32(0xc7001000u), 0b1111100000000000u, -32768.0f}, + // 65520.0f rounds to inf + {F32(0x477ff000u), 0b0111110000000000u, Limits::infinity()}, + // 65488.0039062f rounds to 65504.0 (float16 max) + {F32(0x477fd001u), 0b0111101111111111u, 65504.0f}, + // 32-bit exp is 127 => 2^0, rounds to 16-bit exp of 16 => 2^1. 
+ {F32(0xbffff000u), 0b1100000000000000u, -2.0f}, + // Extreme values should safely clamp to +/-inf + {Limits::max(), 0b0111110000000000u, +Limits::infinity()}, + {Limits::lowest(), 0b1111110000000000u, -Limits::infinity()}, + }; + + TestRoundTrip(span(test_cases, std::size(test_cases))); +} + +template <> +void Float16ConversionTest::TestRoundTrip() { + // Expected values were also manually validated with numpy-1.24.3 + const RoundTripTestCase test_cases[] = { + // +/-0.0 + {F64(0x8000000000000000u), 0b1000000000000000u, -0.0}, + {F64(0x0000000000000000u), 0b0000000000000000u, +0.0}, + // 64-bit exp is 998 => 2^-25. Rounding to nearest. + {F64(0xbe60000000000001u), 0b1000000000000001u, -5.9604644775390625e-8}, + // 64-bit exp is 998 => 2^-25. Rounding to even. + {F64(0xbe60000000000000u), 0b1000000000000000u, -0.0}, + // 64-bit exp is 997 => 2^-26. Underflow to zero. + {F64(0xbe50000000000001u), 0b1000000000000000u, -0.0}, + // 64-bit exp is 1004 => 2^-19. + {F64(0xbec3400000000000u), 0b1000000000100110u, -2.2649765014648438e-6}, + // 64-bit exp is 1004 => 2^-19. + {F64(0xbec3c00000000000u), 0b1000000000101000u, -2.3841857910156250e-6}, + // 64-bit exp is 1008 => 2^-15. Rounding to nearest. + {F64(0xbf0ff40000000001u), 0b1000001111111111u, -6.0975551605224609e-5}, + // 64-bit exp is 1008 => 2^-15. Rounds to 16-bit exp of 1 => 2^-14 + {F64(0xbf0ffc0000000001u), 0b1000010000000000u, -6.1035156250000000e-5}, + // 64-bit exp is 1038 => 2^15. Rounding to nearest. + {F64(0xc0e0020000000001u), 0b1111100000000001u, -32800.0}, + // 64-bit exp is 1038 => 2^15. Rounding to even. + {F64(0xc0e0020000000000u), 0b1111100000000000u, -32768.0}, + // 65520.0 rounds to inf + {F64(0x40effe0000000000u), 0b0111110000000000u, Limits::infinity()}, + // 65488.00000000001 rounds to 65504.0 (float16 max) + {F64(0x40effa0000000001u), 0b0111101111111111u, 65504.0}, + // 64-bit exp is 1023 => 2^0, rounds to 16-bit exp of 16 => 2^1. 
+ {F64(0xbffffe0000000000u), 0b1100000000000000u, -2.0}, + // Extreme values should safely clamp to +/-inf + {Limits::max(), 0b0111110000000000u, +Limits::infinity()}, + {Limits::lowest(), 0b1111110000000000u, -Limits::infinity()}, + }; + + TestRoundTrip(span(test_cases, std::size(test_cases))); +} + +template <> +void Float16ConversionTest::TestRoundTripFromNaN() { + const float test_cases[] = { + Limits::quiet_NaN(), F32(0x7f800001u), F32(0xff800001u), F32(0x7fc00000u), + F32(0xffc00000u), F32(0x7fffffffu), F32(0xffffffffu)}; + TestRoundTripFromNaN(span(test_cases, std::size(test_cases))); +} + +template <> +void Float16ConversionTest::TestRoundTripFromNaN() { + const double test_cases[] = {Limits::quiet_NaN(), F64(0x7ff0000000000001u), + F64(0xfff0000000000001u), F64(0x7ff8000000000000u), + F64(0xfff8000000000000u), F64(0x7fffffffffffffffu), + F64(0xffffffffffffffffu)}; + TestRoundTripFromNaN(span(test_cases, std::size(test_cases))); +} + +using NativeFloatTypes = ::testing::Types; + +TYPED_TEST_SUITE(Float16ConversionTest, NativeFloatTypes); + +TYPED_TEST(Float16ConversionTest, RoundTrip) { this->TestRoundTrip(); } +TYPED_TEST(Float16ConversionTest, RoundTripFromNaN) { this->TestRoundTripFromNaN(); } +TYPED_TEST(Float16ConversionTest, RoundTripFromInf) { this->TestRoundTripFromInf(); } + +TEST(Float16Test, ConstexprFunctions) { + constexpr auto a = Float16::FromBits(0xbc00); // -1.0 + constexpr auto b = Float16::FromBits(0x3c00); // +1.0 + + static_assert(a.bits() == 0xbc00); + static_assert(a.signbit() == true); + static_assert(a.is_nan() == false); + static_assert(a.is_infinity() == false); + static_assert(a.is_finite() == true); + static_assert(a.is_zero() == false); + + static_assert((a == b) == false); + static_assert((a != b) == true); + static_assert((a < b) == true); + static_assert((a > b) == false); + static_assert((a <= b) == true); + static_assert((a >= b) == false); + static_assert(-a == +b); + + constexpr auto v = Float16::FromBits(0xffff); + 
static_assert(v.ToBytes()[0] == 0xff); + static_assert(v.ToLittleEndian()[0] == 0xff); + static_assert(v.ToBigEndian()[0] == 0xff); +} + +TEST(Float16Test, Constructors) { + // Construction from exact bits + ASSERT_EQ(1, Float16::FromBits(1).bits()); + // Construction from floating point (including implicit conversions) + int i = 0; + for (auto f16 : {Float16(1.0f), Float16(1.0), Float16(1)}) { + ARROW_SCOPED_TRACE("i=", i++); + ASSERT_EQ(0x3c00, f16.bits()); + } +} + +TEST(Float16Test, Compare) { + constexpr float f32_inf = Limits::infinity(); + constexpr float f32_nan = Limits::quiet_NaN(); + + const struct { + Float16 f16; + float f32; + } test_values[] = { + {Limits::min(), +6.103515625e-05f}, + {Limits::max(), +65504.0f}, + {Limits::lowest(), -65504.0f}, + {+Limits::infinity(), +f32_inf}, + {-Limits::infinity(), -f32_inf}, + // Multiple (semantically equivalent) NaN representations + {Float16::FromBits(0x7e00), f32_nan}, + {Float16::FromBits(0xfe00), f32_nan}, + {Float16::FromBits(0x7fff), f32_nan}, + {Float16::FromBits(0xffff), f32_nan}, + // Positive/negative zeros + {Float16::FromBits(0x0000), +0.0f}, + {Float16::FromBits(0x8000), -0.0f}, + // Miscellaneous values. 
In general, they're chosen to test the sign/exponent and + // exponent/mantissa boundaries + {Float16::FromBits(0x101c), +0.00050163269043f}, + {Float16::FromBits(0x901c), -0.00050163269043f}, + {Float16::FromBits(0x101d), +0.000502109527588f}, + {Float16::FromBits(0x901d), -0.000502109527588f}, + {Float16::FromBits(0x121c), +0.00074577331543f}, + {Float16::FromBits(0x921c), -0.00074577331543f}, + {Float16::FromBits(0x141c), +0.00100326538086f}, + {Float16::FromBits(0x941c), -0.00100326538086f}, + {Float16::FromBits(0x501c), +32.875f}, + {Float16::FromBits(0xd01c), -32.875f}, + // A few subnormals for good measure + {Float16::FromBits(0x001c), +1.66893005371e-06f}, + {Float16::FromBits(0x801c), -1.66893005371e-06f}, + {Float16::FromBits(0x021c), +3.21865081787e-05f}, + {Float16::FromBits(0x821c), -3.21865081787e-05f}, + }; + + auto expect_op = [&](std::string op_name, auto op) { + ARROW_SCOPED_TRACE(op_name); + const auto num_values = static_cast(std::size(test_values)); + + // Check all combinations of operands in both directions + for (int i = 0; i < num_values; ++i) { + for (int j = 0; j < num_values; ++j) { + auto [a16, a32] = test_values[i]; + auto [b16, b32] = test_values[j]; + ARROW_SCOPED_TRACE("[", i, ",", j, "] = ", a16, ",", b16); + + // Results for float16 and float32 should be the same + ASSERT_EQ(op(a16, b16), op(a32, b32)); + } + } + }; + + // Verify that our "equivalent" 16/32-bit values actually are + for (const auto& v : test_values) { + if (std::isnan(v.f32)) { + ASSERT_TRUE(std::isnan(v.f16.ToFloat())); + } else { + ASSERT_EQ(v.f32, v.f16.ToFloat()); + } + } + + expect_op("equal", [](auto l, auto r) { return l == r; }); + expect_op("not_equal", [](auto l, auto r) { return l != r; }); + expect_op("less", [](auto l, auto r) { return l < r; }); + expect_op("greater", [](auto l, auto r) { return l > r; }); + expect_op("less_equal", [](auto l, auto r) { return l <= r; }); + expect_op("greater_equal", [](auto l, auto r) { return l >= r; }); +} + 
+TEST(Float16Test, ToBytes) { + constexpr auto f16 = Float16::FromBits(0xd01c); + std::array bytes; + auto load = [&bytes]() { return SafeLoadAs(bytes.data()); }; + + // Test native-endian + f16.ToBytes(bytes.data()); + ASSERT_EQ(load(), 0xd01c); + bytes = f16.ToBytes(); + ASSERT_EQ(load(), 0xd01c); + +#if ARROW_LITTLE_ENDIAN + constexpr uint16_t expected_le = 0xd01c; + constexpr uint16_t expected_be = 0x1cd0; +#else + constexpr uint16_t expected_le = 0x1cd0; + constexpr uint16_t expected_be = 0xd01c; +#endif + // Test little-endian + f16.ToLittleEndian(bytes.data()); + ASSERT_EQ(load(), expected_le); + bytes = f16.ToLittleEndian(); + ASSERT_EQ(load(), expected_le); + // Test big-endian + f16.ToBigEndian(bytes.data()); + ASSERT_EQ(load(), expected_be); + bytes = f16.ToBigEndian(); + ASSERT_EQ(load(), expected_be); +} + +TEST(Float16Test, FromBytes) { + constexpr uint16_t u16 = 0xd01c; + const auto* data = reinterpret_cast(&u16); + ASSERT_EQ(Float16::FromBytes(data), Float16::FromBits(0xd01c)); +#if ARROW_LITTLE_ENDIAN + ASSERT_EQ(Float16::FromLittleEndian(data), Float16::FromBits(0xd01c)); + ASSERT_EQ(Float16::FromBigEndian(data), Float16::FromBits(0x1cd0)); +#else + ASSERT_EQ(Float16::FromLittleEndian(data), Float16(0x1cd0)); + ASSERT_EQ(Float16::FromBigEndian(data), Float16(0xd01c)); +#endif +} + +} // namespace +} // namespace arrow::util diff --git a/cpp/src/generated/parquet_types.cpp b/cpp/src/generated/parquet_types.cpp index f4e378fd3822a..86188581e0c42 100644 --- a/cpp/src/generated/parquet_types.cpp +++ b/cpp/src/generated/parquet_types.cpp @@ -1288,6 +1288,81 @@ void DateType::printTo(std::ostream& out) const { } +Float16Type::~Float16Type() noexcept { +} + +std::ostream& operator<<(std::ostream& out, const Float16Type& obj) +{ + obj.printTo(out); + return out; +} + + +uint32_t Float16Type::read(::apache::thrift::protocol::TProtocol* iprot) { + + ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot); + uint32_t xfer = 0; + std::string fname; 
+ ::apache::thrift::protocol::TType ftype; + int16_t fid; + + xfer += iprot->readStructBegin(fname); + + using ::apache::thrift::protocol::TProtocolException; + + + while (true) + { + xfer += iprot->readFieldBegin(fname, ftype, fid); + if (ftype == ::apache::thrift::protocol::T_STOP) { + break; + } + xfer += iprot->skip(ftype); + xfer += iprot->readFieldEnd(); + } + + xfer += iprot->readStructEnd(); + + return xfer; +} + +uint32_t Float16Type::write(::apache::thrift::protocol::TProtocol* oprot) const { + uint32_t xfer = 0; + ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot); + xfer += oprot->writeStructBegin("Float16Type"); + + xfer += oprot->writeFieldStop(); + xfer += oprot->writeStructEnd(); + return xfer; +} + +void swap(Float16Type &a, Float16Type &b) { + using ::std::swap; + (void) a; + (void) b; +} + +Float16Type::Float16Type(const Float16Type& other28) noexcept { + (void) other28; +} +Float16Type::Float16Type(Float16Type&& other29) noexcept { + (void) other29; +} +Float16Type& Float16Type::operator=(const Float16Type& other30) noexcept { + (void) other30; + return *this; +} +Float16Type& Float16Type::operator=(Float16Type&& other31) noexcept { + (void) other31; + return *this; +} +void Float16Type::printTo(std::ostream& out) const { + using ::apache::thrift::to_string; + out << "Float16Type("; + out << ")"; +} + + NullType::~NullType() noexcept { } @@ -1342,18 +1417,18 @@ void swap(NullType &a, NullType &b) { (void) b; } -NullType::NullType(const NullType& other28) noexcept { - (void) other28; +NullType::NullType(const NullType& other32) noexcept { + (void) other32; } -NullType::NullType(NullType&& other29) noexcept { - (void) other29; +NullType::NullType(NullType&& other33) noexcept { + (void) other33; } -NullType& NullType::operator=(const NullType& other30) noexcept { - (void) other30; +NullType& NullType::operator=(const NullType& other34) noexcept { + (void) other34; return *this; } -NullType& NullType::operator=(NullType&& other31) 
noexcept { - (void) other31; +NullType& NullType::operator=(NullType&& other35) noexcept { + (void) other35; return *this; } void NullType::printTo(std::ostream& out) const { @@ -1460,22 +1535,22 @@ void swap(DecimalType &a, DecimalType &b) { swap(a.precision, b.precision); } -DecimalType::DecimalType(const DecimalType& other32) noexcept { - scale = other32.scale; - precision = other32.precision; +DecimalType::DecimalType(const DecimalType& other36) noexcept { + scale = other36.scale; + precision = other36.precision; } -DecimalType::DecimalType(DecimalType&& other33) noexcept { - scale = other33.scale; - precision = other33.precision; +DecimalType::DecimalType(DecimalType&& other37) noexcept { + scale = other37.scale; + precision = other37.precision; } -DecimalType& DecimalType::operator=(const DecimalType& other34) noexcept { - scale = other34.scale; - precision = other34.precision; +DecimalType& DecimalType::operator=(const DecimalType& other38) noexcept { + scale = other38.scale; + precision = other38.precision; return *this; } -DecimalType& DecimalType::operator=(DecimalType&& other35) noexcept { - scale = other35.scale; - precision = other35.precision; +DecimalType& DecimalType::operator=(DecimalType&& other39) noexcept { + scale = other39.scale; + precision = other39.precision; return *this; } void DecimalType::printTo(std::ostream& out) const { @@ -1541,18 +1616,18 @@ void swap(MilliSeconds &a, MilliSeconds &b) { (void) b; } -MilliSeconds::MilliSeconds(const MilliSeconds& other36) noexcept { - (void) other36; +MilliSeconds::MilliSeconds(const MilliSeconds& other40) noexcept { + (void) other40; } -MilliSeconds::MilliSeconds(MilliSeconds&& other37) noexcept { - (void) other37; +MilliSeconds::MilliSeconds(MilliSeconds&& other41) noexcept { + (void) other41; } -MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other38) noexcept { - (void) other38; +MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other42) noexcept { + (void) other42; return 
*this; } -MilliSeconds& MilliSeconds::operator=(MilliSeconds&& other39) noexcept { - (void) other39; +MilliSeconds& MilliSeconds::operator=(MilliSeconds&& other43) noexcept { + (void) other43; return *this; } void MilliSeconds::printTo(std::ostream& out) const { @@ -1616,18 +1691,18 @@ void swap(MicroSeconds &a, MicroSeconds &b) { (void) b; } -MicroSeconds::MicroSeconds(const MicroSeconds& other40) noexcept { - (void) other40; +MicroSeconds::MicroSeconds(const MicroSeconds& other44) noexcept { + (void) other44; } -MicroSeconds::MicroSeconds(MicroSeconds&& other41) noexcept { - (void) other41; +MicroSeconds::MicroSeconds(MicroSeconds&& other45) noexcept { + (void) other45; } -MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other42) noexcept { - (void) other42; +MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other46) noexcept { + (void) other46; return *this; } -MicroSeconds& MicroSeconds::operator=(MicroSeconds&& other43) noexcept { - (void) other43; +MicroSeconds& MicroSeconds::operator=(MicroSeconds&& other47) noexcept { + (void) other47; return *this; } void MicroSeconds::printTo(std::ostream& out) const { @@ -1691,18 +1766,18 @@ void swap(NanoSeconds &a, NanoSeconds &b) { (void) b; } -NanoSeconds::NanoSeconds(const NanoSeconds& other44) noexcept { - (void) other44; +NanoSeconds::NanoSeconds(const NanoSeconds& other48) noexcept { + (void) other48; } -NanoSeconds::NanoSeconds(NanoSeconds&& other45) noexcept { - (void) other45; +NanoSeconds::NanoSeconds(NanoSeconds&& other49) noexcept { + (void) other49; } -NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other46) noexcept { - (void) other46; +NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other50) noexcept { + (void) other50; return *this; } -NanoSeconds& NanoSeconds::operator=(NanoSeconds&& other47) noexcept { - (void) other47; +NanoSeconds& NanoSeconds::operator=(NanoSeconds&& other51) noexcept { + (void) other51; return *this; } void NanoSeconds::printTo(std::ostream& out) 
const { @@ -1827,30 +1902,30 @@ void swap(TimeUnit &a, TimeUnit &b) { swap(a.__isset, b.__isset); } -TimeUnit::TimeUnit(const TimeUnit& other48) noexcept { - MILLIS = other48.MILLIS; - MICROS = other48.MICROS; - NANOS = other48.NANOS; - __isset = other48.__isset; +TimeUnit::TimeUnit(const TimeUnit& other52) noexcept { + MILLIS = other52.MILLIS; + MICROS = other52.MICROS; + NANOS = other52.NANOS; + __isset = other52.__isset; } -TimeUnit::TimeUnit(TimeUnit&& other49) noexcept { - MILLIS = std::move(other49.MILLIS); - MICROS = std::move(other49.MICROS); - NANOS = std::move(other49.NANOS); - __isset = other49.__isset; +TimeUnit::TimeUnit(TimeUnit&& other53) noexcept { + MILLIS = std::move(other53.MILLIS); + MICROS = std::move(other53.MICROS); + NANOS = std::move(other53.NANOS); + __isset = other53.__isset; } -TimeUnit& TimeUnit::operator=(const TimeUnit& other50) noexcept { - MILLIS = other50.MILLIS; - MICROS = other50.MICROS; - NANOS = other50.NANOS; - __isset = other50.__isset; +TimeUnit& TimeUnit::operator=(const TimeUnit& other54) noexcept { + MILLIS = other54.MILLIS; + MICROS = other54.MICROS; + NANOS = other54.NANOS; + __isset = other54.__isset; return *this; } -TimeUnit& TimeUnit::operator=(TimeUnit&& other51) noexcept { - MILLIS = std::move(other51.MILLIS); - MICROS = std::move(other51.MICROS); - NANOS = std::move(other51.NANOS); - __isset = other51.__isset; +TimeUnit& TimeUnit::operator=(TimeUnit&& other55) noexcept { + MILLIS = std::move(other55.MILLIS); + MICROS = std::move(other55.MICROS); + NANOS = std::move(other55.NANOS); + __isset = other55.__isset; return *this; } void TimeUnit::printTo(std::ostream& out) const { @@ -1960,22 +2035,22 @@ void swap(TimestampType &a, TimestampType &b) { swap(a.unit, b.unit); } -TimestampType::TimestampType(const TimestampType& other52) noexcept { - isAdjustedToUTC = other52.isAdjustedToUTC; - unit = other52.unit; +TimestampType::TimestampType(const TimestampType& other56) noexcept { + isAdjustedToUTC = 
other56.isAdjustedToUTC; + unit = other56.unit; } -TimestampType::TimestampType(TimestampType&& other53) noexcept { - isAdjustedToUTC = other53.isAdjustedToUTC; - unit = std::move(other53.unit); +TimestampType::TimestampType(TimestampType&& other57) noexcept { + isAdjustedToUTC = other57.isAdjustedToUTC; + unit = std::move(other57.unit); } -TimestampType& TimestampType::operator=(const TimestampType& other54) noexcept { - isAdjustedToUTC = other54.isAdjustedToUTC; - unit = other54.unit; +TimestampType& TimestampType::operator=(const TimestampType& other58) noexcept { + isAdjustedToUTC = other58.isAdjustedToUTC; + unit = other58.unit; return *this; } -TimestampType& TimestampType::operator=(TimestampType&& other55) noexcept { - isAdjustedToUTC = other55.isAdjustedToUTC; - unit = std::move(other55.unit); +TimestampType& TimestampType::operator=(TimestampType&& other59) noexcept { + isAdjustedToUTC = other59.isAdjustedToUTC; + unit = std::move(other59.unit); return *this; } void TimestampType::printTo(std::ostream& out) const { @@ -2084,22 +2159,22 @@ void swap(TimeType &a, TimeType &b) { swap(a.unit, b.unit); } -TimeType::TimeType(const TimeType& other56) noexcept { - isAdjustedToUTC = other56.isAdjustedToUTC; - unit = other56.unit; +TimeType::TimeType(const TimeType& other60) noexcept { + isAdjustedToUTC = other60.isAdjustedToUTC; + unit = other60.unit; } -TimeType::TimeType(TimeType&& other57) noexcept { - isAdjustedToUTC = other57.isAdjustedToUTC; - unit = std::move(other57.unit); +TimeType::TimeType(TimeType&& other61) noexcept { + isAdjustedToUTC = other61.isAdjustedToUTC; + unit = std::move(other61.unit); } -TimeType& TimeType::operator=(const TimeType& other58) noexcept { - isAdjustedToUTC = other58.isAdjustedToUTC; - unit = other58.unit; +TimeType& TimeType::operator=(const TimeType& other62) noexcept { + isAdjustedToUTC = other62.isAdjustedToUTC; + unit = other62.unit; return *this; } -TimeType& TimeType::operator=(TimeType&& other59) noexcept { - 
isAdjustedToUTC = other59.isAdjustedToUTC; - unit = std::move(other59.unit); +TimeType& TimeType::operator=(TimeType&& other63) noexcept { + isAdjustedToUTC = other63.isAdjustedToUTC; + unit = std::move(other63.unit); return *this; } void TimeType::printTo(std::ostream& out) const { @@ -2208,22 +2283,22 @@ void swap(IntType &a, IntType &b) { swap(a.isSigned, b.isSigned); } -IntType::IntType(const IntType& other60) noexcept { - bitWidth = other60.bitWidth; - isSigned = other60.isSigned; +IntType::IntType(const IntType& other64) noexcept { + bitWidth = other64.bitWidth; + isSigned = other64.isSigned; } -IntType::IntType(IntType&& other61) noexcept { - bitWidth = other61.bitWidth; - isSigned = other61.isSigned; +IntType::IntType(IntType&& other65) noexcept { + bitWidth = other65.bitWidth; + isSigned = other65.isSigned; } -IntType& IntType::operator=(const IntType& other62) noexcept { - bitWidth = other62.bitWidth; - isSigned = other62.isSigned; +IntType& IntType::operator=(const IntType& other66) noexcept { + bitWidth = other66.bitWidth; + isSigned = other66.isSigned; return *this; } -IntType& IntType::operator=(IntType&& other63) noexcept { - bitWidth = other63.bitWidth; - isSigned = other63.isSigned; +IntType& IntType::operator=(IntType&& other67) noexcept { + bitWidth = other67.bitWidth; + isSigned = other67.isSigned; return *this; } void IntType::printTo(std::ostream& out) const { @@ -2289,18 +2364,18 @@ void swap(JsonType &a, JsonType &b) { (void) b; } -JsonType::JsonType(const JsonType& other64) noexcept { - (void) other64; +JsonType::JsonType(const JsonType& other68) noexcept { + (void) other68; } -JsonType::JsonType(JsonType&& other65) noexcept { - (void) other65; +JsonType::JsonType(JsonType&& other69) noexcept { + (void) other69; } -JsonType& JsonType::operator=(const JsonType& other66) noexcept { - (void) other66; +JsonType& JsonType::operator=(const JsonType& other70) noexcept { + (void) other70; return *this; } -JsonType& JsonType::operator=(JsonType&& 
other67) noexcept { - (void) other67; +JsonType& JsonType::operator=(JsonType&& other71) noexcept { + (void) other71; return *this; } void JsonType::printTo(std::ostream& out) const { @@ -2364,18 +2439,18 @@ void swap(BsonType &a, BsonType &b) { (void) b; } -BsonType::BsonType(const BsonType& other68) noexcept { - (void) other68; +BsonType::BsonType(const BsonType& other72) noexcept { + (void) other72; } -BsonType::BsonType(BsonType&& other69) noexcept { - (void) other69; +BsonType::BsonType(BsonType&& other73) noexcept { + (void) other73; } -BsonType& BsonType::operator=(const BsonType& other70) noexcept { - (void) other70; +BsonType& BsonType::operator=(const BsonType& other74) noexcept { + (void) other74; return *this; } -BsonType& BsonType::operator=(BsonType&& other71) noexcept { - (void) other71; +BsonType& BsonType::operator=(BsonType&& other75) noexcept { + (void) other75; return *this; } void BsonType::printTo(std::ostream& out) const { @@ -2453,6 +2528,11 @@ void LogicalType::__set_UUID(const UUIDType& val) { this->UUID = val; __isset.UUID = true; } + +void LogicalType::__set_FLOAT16(const Float16Type& val) { + this->FLOAT16 = val; +__isset.FLOAT16 = true; +} std::ostream& operator<<(std::ostream& out, const LogicalType& obj) { obj.printTo(out); @@ -2585,6 +2665,14 @@ uint32_t LogicalType::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 15: + if (ftype == ::apache::thrift::protocol::T_STRUCT) { + xfer += this->FLOAT16.read(iprot); + this->__isset.FLOAT16 = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -2667,6 +2755,11 @@ uint32_t LogicalType::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += this->UUID.write(oprot); xfer += oprot->writeFieldEnd(); } + if (this->__isset.FLOAT16) { + xfer += oprot->writeFieldBegin("FLOAT16", ::apache::thrift::protocol::T_STRUCT, 15); + xfer += this->FLOAT16.write(oprot); + xfer += 
oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -2687,73 +2780,78 @@ void swap(LogicalType &a, LogicalType &b) { swap(a.JSON, b.JSON); swap(a.BSON, b.BSON); swap(a.UUID, b.UUID); + swap(a.FLOAT16, b.FLOAT16); swap(a.__isset, b.__isset); } -LogicalType::LogicalType(const LogicalType& other72) noexcept { - STRING = other72.STRING; - MAP = other72.MAP; - LIST = other72.LIST; - ENUM = other72.ENUM; - DECIMAL = other72.DECIMAL; - DATE = other72.DATE; - TIME = other72.TIME; - TIMESTAMP = other72.TIMESTAMP; - INTEGER = other72.INTEGER; - UNKNOWN = other72.UNKNOWN; - JSON = other72.JSON; - BSON = other72.BSON; - UUID = other72.UUID; - __isset = other72.__isset; -} -LogicalType::LogicalType(LogicalType&& other73) noexcept { - STRING = std::move(other73.STRING); - MAP = std::move(other73.MAP); - LIST = std::move(other73.LIST); - ENUM = std::move(other73.ENUM); - DECIMAL = std::move(other73.DECIMAL); - DATE = std::move(other73.DATE); - TIME = std::move(other73.TIME); - TIMESTAMP = std::move(other73.TIMESTAMP); - INTEGER = std::move(other73.INTEGER); - UNKNOWN = std::move(other73.UNKNOWN); - JSON = std::move(other73.JSON); - BSON = std::move(other73.BSON); - UUID = std::move(other73.UUID); - __isset = other73.__isset; -} -LogicalType& LogicalType::operator=(const LogicalType& other74) noexcept { - STRING = other74.STRING; - MAP = other74.MAP; - LIST = other74.LIST; - ENUM = other74.ENUM; - DECIMAL = other74.DECIMAL; - DATE = other74.DATE; - TIME = other74.TIME; - TIMESTAMP = other74.TIMESTAMP; - INTEGER = other74.INTEGER; - UNKNOWN = other74.UNKNOWN; - JSON = other74.JSON; - BSON = other74.BSON; - UUID = other74.UUID; - __isset = other74.__isset; +LogicalType::LogicalType(const LogicalType& other76) noexcept { + STRING = other76.STRING; + MAP = other76.MAP; + LIST = other76.LIST; + ENUM = other76.ENUM; + DECIMAL = other76.DECIMAL; + DATE = other76.DATE; + TIME = other76.TIME; + TIMESTAMP = other76.TIMESTAMP; + 
INTEGER = other76.INTEGER; + UNKNOWN = other76.UNKNOWN; + JSON = other76.JSON; + BSON = other76.BSON; + UUID = other76.UUID; + FLOAT16 = other76.FLOAT16; + __isset = other76.__isset; +} +LogicalType::LogicalType(LogicalType&& other77) noexcept { + STRING = std::move(other77.STRING); + MAP = std::move(other77.MAP); + LIST = std::move(other77.LIST); + ENUM = std::move(other77.ENUM); + DECIMAL = std::move(other77.DECIMAL); + DATE = std::move(other77.DATE); + TIME = std::move(other77.TIME); + TIMESTAMP = std::move(other77.TIMESTAMP); + INTEGER = std::move(other77.INTEGER); + UNKNOWN = std::move(other77.UNKNOWN); + JSON = std::move(other77.JSON); + BSON = std::move(other77.BSON); + UUID = std::move(other77.UUID); + FLOAT16 = std::move(other77.FLOAT16); + __isset = other77.__isset; +} +LogicalType& LogicalType::operator=(const LogicalType& other78) noexcept { + STRING = other78.STRING; + MAP = other78.MAP; + LIST = other78.LIST; + ENUM = other78.ENUM; + DECIMAL = other78.DECIMAL; + DATE = other78.DATE; + TIME = other78.TIME; + TIMESTAMP = other78.TIMESTAMP; + INTEGER = other78.INTEGER; + UNKNOWN = other78.UNKNOWN; + JSON = other78.JSON; + BSON = other78.BSON; + UUID = other78.UUID; + FLOAT16 = other78.FLOAT16; + __isset = other78.__isset; return *this; } -LogicalType& LogicalType::operator=(LogicalType&& other75) noexcept { - STRING = std::move(other75.STRING); - MAP = std::move(other75.MAP); - LIST = std::move(other75.LIST); - ENUM = std::move(other75.ENUM); - DECIMAL = std::move(other75.DECIMAL); - DATE = std::move(other75.DATE); - TIME = std::move(other75.TIME); - TIMESTAMP = std::move(other75.TIMESTAMP); - INTEGER = std::move(other75.INTEGER); - UNKNOWN = std::move(other75.UNKNOWN); - JSON = std::move(other75.JSON); - BSON = std::move(other75.BSON); - UUID = std::move(other75.UUID); - __isset = other75.__isset; +LogicalType& LogicalType::operator=(LogicalType&& other79) noexcept { + STRING = std::move(other79.STRING); + MAP = std::move(other79.MAP); + LIST = 
std::move(other79.LIST); + ENUM = std::move(other79.ENUM); + DECIMAL = std::move(other79.DECIMAL); + DATE = std::move(other79.DATE); + TIME = std::move(other79.TIME); + TIMESTAMP = std::move(other79.TIMESTAMP); + INTEGER = std::move(other79.INTEGER); + UNKNOWN = std::move(other79.UNKNOWN); + JSON = std::move(other79.JSON); + BSON = std::move(other79.BSON); + UUID = std::move(other79.UUID); + FLOAT16 = std::move(other79.FLOAT16); + __isset = other79.__isset; return *this; } void LogicalType::printTo(std::ostream& out) const { @@ -2772,6 +2870,7 @@ void LogicalType::printTo(std::ostream& out) const { out << ", " << "JSON="; (__isset.JSON ? (out << to_string(JSON)) : (out << "")); out << ", " << "BSON="; (__isset.BSON ? (out << to_string(BSON)) : (out << "")); out << ", " << "UUID="; (__isset.UUID ? (out << to_string(UUID)) : (out << "")); + out << ", " << "FLOAT16="; (__isset.FLOAT16 ? (out << to_string(FLOAT16)) : (out << "")); out << ")"; } @@ -2859,9 +2958,9 @@ uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast76; - xfer += iprot->readI32(ecast76); - this->type = static_cast(ecast76); + int32_t ecast80; + xfer += iprot->readI32(ecast80); + this->type = static_cast(ecast80); this->__isset.type = true; } else { xfer += iprot->skip(ftype); @@ -2877,9 +2976,9 @@ uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 3: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast77; - xfer += iprot->readI32(ecast77); - this->repetition_type = static_cast(ecast77); + int32_t ecast81; + xfer += iprot->readI32(ecast81); + this->repetition_type = static_cast(ecast81); this->__isset.repetition_type = true; } else { xfer += iprot->skip(ftype); @@ -2903,9 +3002,9 @@ uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 6: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast78; - xfer += 
iprot->readI32(ecast78); - this->converted_type = static_cast(ecast78); + int32_t ecast82; + xfer += iprot->readI32(ecast82); + this->converted_type = static_cast(ecast82); this->__isset.converted_type = true; } else { xfer += iprot->skip(ftype); @@ -3031,58 +3130,58 @@ void swap(SchemaElement &a, SchemaElement &b) { swap(a.__isset, b.__isset); } -SchemaElement::SchemaElement(const SchemaElement& other79) { - type = other79.type; - type_length = other79.type_length; - repetition_type = other79.repetition_type; - name = other79.name; - num_children = other79.num_children; - converted_type = other79.converted_type; - scale = other79.scale; - precision = other79.precision; - field_id = other79.field_id; - logicalType = other79.logicalType; - __isset = other79.__isset; -} -SchemaElement::SchemaElement(SchemaElement&& other80) noexcept { - type = other80.type; - type_length = other80.type_length; - repetition_type = other80.repetition_type; - name = std::move(other80.name); - num_children = other80.num_children; - converted_type = other80.converted_type; - scale = other80.scale; - precision = other80.precision; - field_id = other80.field_id; - logicalType = std::move(other80.logicalType); - __isset = other80.__isset; -} -SchemaElement& SchemaElement::operator=(const SchemaElement& other81) { - type = other81.type; - type_length = other81.type_length; - repetition_type = other81.repetition_type; - name = other81.name; - num_children = other81.num_children; - converted_type = other81.converted_type; - scale = other81.scale; - precision = other81.precision; - field_id = other81.field_id; - logicalType = other81.logicalType; - __isset = other81.__isset; +SchemaElement::SchemaElement(const SchemaElement& other83) { + type = other83.type; + type_length = other83.type_length; + repetition_type = other83.repetition_type; + name = other83.name; + num_children = other83.num_children; + converted_type = other83.converted_type; + scale = other83.scale; + precision = 
other83.precision; + field_id = other83.field_id; + logicalType = other83.logicalType; + __isset = other83.__isset; +} +SchemaElement::SchemaElement(SchemaElement&& other84) noexcept { + type = other84.type; + type_length = other84.type_length; + repetition_type = other84.repetition_type; + name = std::move(other84.name); + num_children = other84.num_children; + converted_type = other84.converted_type; + scale = other84.scale; + precision = other84.precision; + field_id = other84.field_id; + logicalType = std::move(other84.logicalType); + __isset = other84.__isset; +} +SchemaElement& SchemaElement::operator=(const SchemaElement& other85) { + type = other85.type; + type_length = other85.type_length; + repetition_type = other85.repetition_type; + name = other85.name; + num_children = other85.num_children; + converted_type = other85.converted_type; + scale = other85.scale; + precision = other85.precision; + field_id = other85.field_id; + logicalType = other85.logicalType; + __isset = other85.__isset; return *this; } -SchemaElement& SchemaElement::operator=(SchemaElement&& other82) noexcept { - type = other82.type; - type_length = other82.type_length; - repetition_type = other82.repetition_type; - name = std::move(other82.name); - num_children = other82.num_children; - converted_type = other82.converted_type; - scale = other82.scale; - precision = other82.precision; - field_id = other82.field_id; - logicalType = std::move(other82.logicalType); - __isset = other82.__isset; +SchemaElement& SchemaElement::operator=(SchemaElement&& other86) noexcept { + type = other86.type; + type_length = other86.type_length; + repetition_type = other86.repetition_type; + name = std::move(other86.name); + num_children = other86.num_children; + converted_type = other86.converted_type; + scale = other86.scale; + precision = other86.precision; + field_id = other86.field_id; + logicalType = std::move(other86.logicalType); + __isset = other86.__isset; return *this; } void 
SchemaElement::printTo(std::ostream& out) const { @@ -3168,9 +3267,9 @@ uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 2: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast83; - xfer += iprot->readI32(ecast83); - this->encoding = static_cast(ecast83); + int32_t ecast87; + xfer += iprot->readI32(ecast87); + this->encoding = static_cast(ecast87); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3178,9 +3277,9 @@ uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 3: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast84; - xfer += iprot->readI32(ecast84); - this->definition_level_encoding = static_cast(ecast84); + int32_t ecast88; + xfer += iprot->readI32(ecast88); + this->definition_level_encoding = static_cast(ecast88); isset_definition_level_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3188,9 +3287,9 @@ uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast85; - xfer += iprot->readI32(ecast85); - this->repetition_level_encoding = static_cast(ecast85); + int32_t ecast89; + xfer += iprot->readI32(ecast89); + this->repetition_level_encoding = static_cast(ecast89); isset_repetition_level_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3265,38 +3364,38 @@ void swap(DataPageHeader &a, DataPageHeader &b) { swap(a.__isset, b.__isset); } -DataPageHeader::DataPageHeader(const DataPageHeader& other86) { - num_values = other86.num_values; - encoding = other86.encoding; - definition_level_encoding = other86.definition_level_encoding; - repetition_level_encoding = other86.repetition_level_encoding; - statistics = other86.statistics; - __isset = other86.__isset; -} -DataPageHeader::DataPageHeader(DataPageHeader&& other87) noexcept { - num_values = other87.num_values; - encoding = other87.encoding; - definition_level_encoding = 
other87.definition_level_encoding; - repetition_level_encoding = other87.repetition_level_encoding; - statistics = std::move(other87.statistics); - __isset = other87.__isset; -} -DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other88) { - num_values = other88.num_values; - encoding = other88.encoding; - definition_level_encoding = other88.definition_level_encoding; - repetition_level_encoding = other88.repetition_level_encoding; - statistics = other88.statistics; - __isset = other88.__isset; +DataPageHeader::DataPageHeader(const DataPageHeader& other90) { + num_values = other90.num_values; + encoding = other90.encoding; + definition_level_encoding = other90.definition_level_encoding; + repetition_level_encoding = other90.repetition_level_encoding; + statistics = other90.statistics; + __isset = other90.__isset; +} +DataPageHeader::DataPageHeader(DataPageHeader&& other91) noexcept { + num_values = other91.num_values; + encoding = other91.encoding; + definition_level_encoding = other91.definition_level_encoding; + repetition_level_encoding = other91.repetition_level_encoding; + statistics = std::move(other91.statistics); + __isset = other91.__isset; +} +DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other92) { + num_values = other92.num_values; + encoding = other92.encoding; + definition_level_encoding = other92.definition_level_encoding; + repetition_level_encoding = other92.repetition_level_encoding; + statistics = other92.statistics; + __isset = other92.__isset; return *this; } -DataPageHeader& DataPageHeader::operator=(DataPageHeader&& other89) noexcept { - num_values = other89.num_values; - encoding = other89.encoding; - definition_level_encoding = other89.definition_level_encoding; - repetition_level_encoding = other89.repetition_level_encoding; - statistics = std::move(other89.statistics); - __isset = other89.__isset; +DataPageHeader& DataPageHeader::operator=(DataPageHeader&& other93) noexcept { + num_values = 
other93.num_values; + encoding = other93.encoding; + definition_level_encoding = other93.definition_level_encoding; + repetition_level_encoding = other93.repetition_level_encoding; + statistics = std::move(other93.statistics); + __isset = other93.__isset; return *this; } void DataPageHeader::printTo(std::ostream& out) const { @@ -3365,18 +3464,18 @@ void swap(IndexPageHeader &a, IndexPageHeader &b) { (void) b; } -IndexPageHeader::IndexPageHeader(const IndexPageHeader& other90) noexcept { - (void) other90; +IndexPageHeader::IndexPageHeader(const IndexPageHeader& other94) noexcept { + (void) other94; } -IndexPageHeader::IndexPageHeader(IndexPageHeader&& other91) noexcept { - (void) other91; +IndexPageHeader::IndexPageHeader(IndexPageHeader&& other95) noexcept { + (void) other95; } -IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other92) noexcept { - (void) other92; +IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other96) noexcept { + (void) other96; return *this; } -IndexPageHeader& IndexPageHeader::operator=(IndexPageHeader&& other93) noexcept { - (void) other93; +IndexPageHeader& IndexPageHeader::operator=(IndexPageHeader&& other97) noexcept { + (void) other97; return *this; } void IndexPageHeader::printTo(std::ostream& out) const { @@ -3442,9 +3541,9 @@ uint32_t DictionaryPageHeader::read(::apache::thrift::protocol::TProtocol* iprot break; case 2: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast94; - xfer += iprot->readI32(ecast94); - this->encoding = static_cast(ecast94); + int32_t ecast98; + xfer += iprot->readI32(ecast98); + this->encoding = static_cast(ecast98); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3505,30 +3604,30 @@ void swap(DictionaryPageHeader &a, DictionaryPageHeader &b) { swap(a.__isset, b.__isset); } -DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other95) noexcept { - num_values = other95.num_values; - encoding = other95.encoding; - is_sorted = 
other95.is_sorted; - __isset = other95.__isset; +DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other99) noexcept { + num_values = other99.num_values; + encoding = other99.encoding; + is_sorted = other99.is_sorted; + __isset = other99.__isset; } -DictionaryPageHeader::DictionaryPageHeader(DictionaryPageHeader&& other96) noexcept { - num_values = other96.num_values; - encoding = other96.encoding; - is_sorted = other96.is_sorted; - __isset = other96.__isset; +DictionaryPageHeader::DictionaryPageHeader(DictionaryPageHeader&& other100) noexcept { + num_values = other100.num_values; + encoding = other100.encoding; + is_sorted = other100.is_sorted; + __isset = other100.__isset; } -DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other97) noexcept { - num_values = other97.num_values; - encoding = other97.encoding; - is_sorted = other97.is_sorted; - __isset = other97.__isset; +DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other101) noexcept { + num_values = other101.num_values; + encoding = other101.encoding; + is_sorted = other101.is_sorted; + __isset = other101.__isset; return *this; } -DictionaryPageHeader& DictionaryPageHeader::operator=(DictionaryPageHeader&& other98) noexcept { - num_values = other98.num_values; - encoding = other98.encoding; - is_sorted = other98.is_sorted; - __isset = other98.__isset; +DictionaryPageHeader& DictionaryPageHeader::operator=(DictionaryPageHeader&& other102) noexcept { + num_values = other102.num_values; + encoding = other102.encoding; + is_sorted = other102.is_sorted; + __isset = other102.__isset; return *this; } void DictionaryPageHeader::printTo(std::ostream& out) const { @@ -3638,9 +3737,9 @@ uint32_t DataPageHeaderV2::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast99; - xfer += iprot->readI32(ecast99); - this->encoding = static_cast(ecast99); + int32_t 
ecast103; + xfer += iprot->readI32(ecast103); + this->encoding = static_cast(ecast103); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3759,50 +3858,50 @@ void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b) { swap(a.__isset, b.__isset); } -DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other100) { - num_values = other100.num_values; - num_nulls = other100.num_nulls; - num_rows = other100.num_rows; - encoding = other100.encoding; - definition_levels_byte_length = other100.definition_levels_byte_length; - repetition_levels_byte_length = other100.repetition_levels_byte_length; - is_compressed = other100.is_compressed; - statistics = other100.statistics; - __isset = other100.__isset; -} -DataPageHeaderV2::DataPageHeaderV2(DataPageHeaderV2&& other101) noexcept { - num_values = other101.num_values; - num_nulls = other101.num_nulls; - num_rows = other101.num_rows; - encoding = other101.encoding; - definition_levels_byte_length = other101.definition_levels_byte_length; - repetition_levels_byte_length = other101.repetition_levels_byte_length; - is_compressed = other101.is_compressed; - statistics = std::move(other101.statistics); - __isset = other101.__isset; -} -DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other102) { - num_values = other102.num_values; - num_nulls = other102.num_nulls; - num_rows = other102.num_rows; - encoding = other102.encoding; - definition_levels_byte_length = other102.definition_levels_byte_length; - repetition_levels_byte_length = other102.repetition_levels_byte_length; - is_compressed = other102.is_compressed; - statistics = other102.statistics; - __isset = other102.__isset; +DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other104) { + num_values = other104.num_values; + num_nulls = other104.num_nulls; + num_rows = other104.num_rows; + encoding = other104.encoding; + definition_levels_byte_length = other104.definition_levels_byte_length; + repetition_levels_byte_length = 
other104.repetition_levels_byte_length; + is_compressed = other104.is_compressed; + statistics = other104.statistics; + __isset = other104.__isset; +} +DataPageHeaderV2::DataPageHeaderV2(DataPageHeaderV2&& other105) noexcept { + num_values = other105.num_values; + num_nulls = other105.num_nulls; + num_rows = other105.num_rows; + encoding = other105.encoding; + definition_levels_byte_length = other105.definition_levels_byte_length; + repetition_levels_byte_length = other105.repetition_levels_byte_length; + is_compressed = other105.is_compressed; + statistics = std::move(other105.statistics); + __isset = other105.__isset; +} +DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other106) { + num_values = other106.num_values; + num_nulls = other106.num_nulls; + num_rows = other106.num_rows; + encoding = other106.encoding; + definition_levels_byte_length = other106.definition_levels_byte_length; + repetition_levels_byte_length = other106.repetition_levels_byte_length; + is_compressed = other106.is_compressed; + statistics = other106.statistics; + __isset = other106.__isset; return *this; } -DataPageHeaderV2& DataPageHeaderV2::operator=(DataPageHeaderV2&& other103) noexcept { - num_values = other103.num_values; - num_nulls = other103.num_nulls; - num_rows = other103.num_rows; - encoding = other103.encoding; - definition_levels_byte_length = other103.definition_levels_byte_length; - repetition_levels_byte_length = other103.repetition_levels_byte_length; - is_compressed = other103.is_compressed; - statistics = std::move(other103.statistics); - __isset = other103.__isset; +DataPageHeaderV2& DataPageHeaderV2::operator=(DataPageHeaderV2&& other107) noexcept { + num_values = other107.num_values; + num_nulls = other107.num_nulls; + num_rows = other107.num_rows; + encoding = other107.encoding; + definition_levels_byte_length = other107.definition_levels_byte_length; + repetition_levels_byte_length = other107.repetition_levels_byte_length; + is_compressed = 
other107.is_compressed; + statistics = std::move(other107.statistics); + __isset = other107.__isset; return *this; } void DataPageHeaderV2::printTo(std::ostream& out) const { @@ -3874,18 +3973,18 @@ void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b) { (void) b; } -SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other104) noexcept { - (void) other104; +SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other108) noexcept { + (void) other108; } -SplitBlockAlgorithm::SplitBlockAlgorithm(SplitBlockAlgorithm&& other105) noexcept { - (void) other105; +SplitBlockAlgorithm::SplitBlockAlgorithm(SplitBlockAlgorithm&& other109) noexcept { + (void) other109; } -SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other106) noexcept { - (void) other106; +SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other110) noexcept { + (void) other110; return *this; } -SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(SplitBlockAlgorithm&& other107) noexcept { - (void) other107; +SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(SplitBlockAlgorithm&& other111) noexcept { + (void) other111; return *this; } void SplitBlockAlgorithm::printTo(std::ostream& out) const { @@ -3972,22 +4071,22 @@ void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b) { swap(a.__isset, b.__isset); } -BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other108) noexcept { - BLOCK = other108.BLOCK; - __isset = other108.__isset; +BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other112) noexcept { + BLOCK = other112.BLOCK; + __isset = other112.__isset; } -BloomFilterAlgorithm::BloomFilterAlgorithm(BloomFilterAlgorithm&& other109) noexcept { - BLOCK = std::move(other109.BLOCK); - __isset = other109.__isset; +BloomFilterAlgorithm::BloomFilterAlgorithm(BloomFilterAlgorithm&& other113) noexcept { + BLOCK = std::move(other113.BLOCK); + __isset = other113.__isset; } 
-BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other110) noexcept { - BLOCK = other110.BLOCK; - __isset = other110.__isset; +BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other114) noexcept { + BLOCK = other114.BLOCK; + __isset = other114.__isset; return *this; } -BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(BloomFilterAlgorithm&& other111) noexcept { - BLOCK = std::move(other111.BLOCK); - __isset = other111.__isset; +BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(BloomFilterAlgorithm&& other115) noexcept { + BLOCK = std::move(other115.BLOCK); + __isset = other115.__isset; return *this; } void BloomFilterAlgorithm::printTo(std::ostream& out) const { @@ -4052,18 +4151,18 @@ void swap(XxHash &a, XxHash &b) { (void) b; } -XxHash::XxHash(const XxHash& other112) noexcept { - (void) other112; +XxHash::XxHash(const XxHash& other116) noexcept { + (void) other116; } -XxHash::XxHash(XxHash&& other113) noexcept { - (void) other113; +XxHash::XxHash(XxHash&& other117) noexcept { + (void) other117; } -XxHash& XxHash::operator=(const XxHash& other114) noexcept { - (void) other114; +XxHash& XxHash::operator=(const XxHash& other118) noexcept { + (void) other118; return *this; } -XxHash& XxHash::operator=(XxHash&& other115) noexcept { - (void) other115; +XxHash& XxHash::operator=(XxHash&& other119) noexcept { + (void) other119; return *this; } void XxHash::printTo(std::ostream& out) const { @@ -4150,22 +4249,22 @@ void swap(BloomFilterHash &a, BloomFilterHash &b) { swap(a.__isset, b.__isset); } -BloomFilterHash::BloomFilterHash(const BloomFilterHash& other116) noexcept { - XXHASH = other116.XXHASH; - __isset = other116.__isset; +BloomFilterHash::BloomFilterHash(const BloomFilterHash& other120) noexcept { + XXHASH = other120.XXHASH; + __isset = other120.__isset; } -BloomFilterHash::BloomFilterHash(BloomFilterHash&& other117) noexcept { - XXHASH = std::move(other117.XXHASH); - __isset = 
other117.__isset; +BloomFilterHash::BloomFilterHash(BloomFilterHash&& other121) noexcept { + XXHASH = std::move(other121.XXHASH); + __isset = other121.__isset; } -BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other118) noexcept { - XXHASH = other118.XXHASH; - __isset = other118.__isset; +BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other122) noexcept { + XXHASH = other122.XXHASH; + __isset = other122.__isset; return *this; } -BloomFilterHash& BloomFilterHash::operator=(BloomFilterHash&& other119) noexcept { - XXHASH = std::move(other119.XXHASH); - __isset = other119.__isset; +BloomFilterHash& BloomFilterHash::operator=(BloomFilterHash&& other123) noexcept { + XXHASH = std::move(other123.XXHASH); + __isset = other123.__isset; return *this; } void BloomFilterHash::printTo(std::ostream& out) const { @@ -4230,18 +4329,18 @@ void swap(Uncompressed &a, Uncompressed &b) { (void) b; } -Uncompressed::Uncompressed(const Uncompressed& other120) noexcept { - (void) other120; +Uncompressed::Uncompressed(const Uncompressed& other124) noexcept { + (void) other124; } -Uncompressed::Uncompressed(Uncompressed&& other121) noexcept { - (void) other121; +Uncompressed::Uncompressed(Uncompressed&& other125) noexcept { + (void) other125; } -Uncompressed& Uncompressed::operator=(const Uncompressed& other122) noexcept { - (void) other122; +Uncompressed& Uncompressed::operator=(const Uncompressed& other126) noexcept { + (void) other126; return *this; } -Uncompressed& Uncompressed::operator=(Uncompressed&& other123) noexcept { - (void) other123; +Uncompressed& Uncompressed::operator=(Uncompressed&& other127) noexcept { + (void) other127; return *this; } void Uncompressed::printTo(std::ostream& out) const { @@ -4328,22 +4427,22 @@ void swap(BloomFilterCompression &a, BloomFilterCompression &b) { swap(a.__isset, b.__isset); } -BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other124) noexcept { - UNCOMPRESSED = 
other124.UNCOMPRESSED; - __isset = other124.__isset; +BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other128) noexcept { + UNCOMPRESSED = other128.UNCOMPRESSED; + __isset = other128.__isset; } -BloomFilterCompression::BloomFilterCompression(BloomFilterCompression&& other125) noexcept { - UNCOMPRESSED = std::move(other125.UNCOMPRESSED); - __isset = other125.__isset; +BloomFilterCompression::BloomFilterCompression(BloomFilterCompression&& other129) noexcept { + UNCOMPRESSED = std::move(other129.UNCOMPRESSED); + __isset = other129.__isset; } -BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other126) noexcept { - UNCOMPRESSED = other126.UNCOMPRESSED; - __isset = other126.__isset; +BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other130) noexcept { + UNCOMPRESSED = other130.UNCOMPRESSED; + __isset = other130.__isset; return *this; } -BloomFilterCompression& BloomFilterCompression::operator=(BloomFilterCompression&& other127) noexcept { - UNCOMPRESSED = std::move(other127.UNCOMPRESSED); - __isset = other127.__isset; +BloomFilterCompression& BloomFilterCompression::operator=(BloomFilterCompression&& other131) noexcept { + UNCOMPRESSED = std::move(other131.UNCOMPRESSED); + __isset = other131.__isset; return *this; } void BloomFilterCompression::printTo(std::ostream& out) const { @@ -4491,30 +4590,30 @@ void swap(BloomFilterHeader &a, BloomFilterHeader &b) { swap(a.compression, b.compression); } -BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other128) noexcept { - numBytes = other128.numBytes; - algorithm = other128.algorithm; - hash = other128.hash; - compression = other128.compression; +BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other132) noexcept { + numBytes = other132.numBytes; + algorithm = other132.algorithm; + hash = other132.hash; + compression = other132.compression; } 
-BloomFilterHeader::BloomFilterHeader(BloomFilterHeader&& other129) noexcept { - numBytes = other129.numBytes; - algorithm = std::move(other129.algorithm); - hash = std::move(other129.hash); - compression = std::move(other129.compression); +BloomFilterHeader::BloomFilterHeader(BloomFilterHeader&& other133) noexcept { + numBytes = other133.numBytes; + algorithm = std::move(other133.algorithm); + hash = std::move(other133.hash); + compression = std::move(other133.compression); } -BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other130) noexcept { - numBytes = other130.numBytes; - algorithm = other130.algorithm; - hash = other130.hash; - compression = other130.compression; +BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other134) noexcept { + numBytes = other134.numBytes; + algorithm = other134.algorithm; + hash = other134.hash; + compression = other134.compression; return *this; } -BloomFilterHeader& BloomFilterHeader::operator=(BloomFilterHeader&& other131) noexcept { - numBytes = other131.numBytes; - algorithm = std::move(other131.algorithm); - hash = std::move(other131.hash); - compression = std::move(other131.compression); +BloomFilterHeader& BloomFilterHeader::operator=(BloomFilterHeader&& other135) noexcept { + numBytes = other135.numBytes; + algorithm = std::move(other135.algorithm); + hash = std::move(other135.hash); + compression = std::move(other135.compression); return *this; } void BloomFilterHeader::printTo(std::ostream& out) const { @@ -4601,9 +4700,9 @@ uint32_t PageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast132; - xfer += iprot->readI32(ecast132); - this->type = static_cast(ecast132); + int32_t ecast136; + xfer += iprot->readI32(ecast136); + this->type = static_cast(ecast136); isset_type = true; } else { xfer += iprot->skip(ftype); @@ -4743,50 +4842,50 @@ void swap(PageHeader &a, PageHeader &b) { 
swap(a.__isset, b.__isset); } -PageHeader::PageHeader(const PageHeader& other133) { - type = other133.type; - uncompressed_page_size = other133.uncompressed_page_size; - compressed_page_size = other133.compressed_page_size; - crc = other133.crc; - data_page_header = other133.data_page_header; - index_page_header = other133.index_page_header; - dictionary_page_header = other133.dictionary_page_header; - data_page_header_v2 = other133.data_page_header_v2; - __isset = other133.__isset; -} -PageHeader::PageHeader(PageHeader&& other134) noexcept { - type = other134.type; - uncompressed_page_size = other134.uncompressed_page_size; - compressed_page_size = other134.compressed_page_size; - crc = other134.crc; - data_page_header = std::move(other134.data_page_header); - index_page_header = std::move(other134.index_page_header); - dictionary_page_header = std::move(other134.dictionary_page_header); - data_page_header_v2 = std::move(other134.data_page_header_v2); - __isset = other134.__isset; -} -PageHeader& PageHeader::operator=(const PageHeader& other135) { - type = other135.type; - uncompressed_page_size = other135.uncompressed_page_size; - compressed_page_size = other135.compressed_page_size; - crc = other135.crc; - data_page_header = other135.data_page_header; - index_page_header = other135.index_page_header; - dictionary_page_header = other135.dictionary_page_header; - data_page_header_v2 = other135.data_page_header_v2; - __isset = other135.__isset; +PageHeader::PageHeader(const PageHeader& other137) { + type = other137.type; + uncompressed_page_size = other137.uncompressed_page_size; + compressed_page_size = other137.compressed_page_size; + crc = other137.crc; + data_page_header = other137.data_page_header; + index_page_header = other137.index_page_header; + dictionary_page_header = other137.dictionary_page_header; + data_page_header_v2 = other137.data_page_header_v2; + __isset = other137.__isset; +} +PageHeader::PageHeader(PageHeader&& other138) noexcept { + type = 
other138.type; + uncompressed_page_size = other138.uncompressed_page_size; + compressed_page_size = other138.compressed_page_size; + crc = other138.crc; + data_page_header = std::move(other138.data_page_header); + index_page_header = std::move(other138.index_page_header); + dictionary_page_header = std::move(other138.dictionary_page_header); + data_page_header_v2 = std::move(other138.data_page_header_v2); + __isset = other138.__isset; +} +PageHeader& PageHeader::operator=(const PageHeader& other139) { + type = other139.type; + uncompressed_page_size = other139.uncompressed_page_size; + compressed_page_size = other139.compressed_page_size; + crc = other139.crc; + data_page_header = other139.data_page_header; + index_page_header = other139.index_page_header; + dictionary_page_header = other139.dictionary_page_header; + data_page_header_v2 = other139.data_page_header_v2; + __isset = other139.__isset; return *this; } -PageHeader& PageHeader::operator=(PageHeader&& other136) noexcept { - type = other136.type; - uncompressed_page_size = other136.uncompressed_page_size; - compressed_page_size = other136.compressed_page_size; - crc = other136.crc; - data_page_header = std::move(other136.data_page_header); - index_page_header = std::move(other136.index_page_header); - dictionary_page_header = std::move(other136.dictionary_page_header); - data_page_header_v2 = std::move(other136.data_page_header_v2); - __isset = other136.__isset; +PageHeader& PageHeader::operator=(PageHeader&& other140) noexcept { + type = other140.type; + uncompressed_page_size = other140.uncompressed_page_size; + compressed_page_size = other140.compressed_page_size; + crc = other140.crc; + data_page_header = std::move(other140.data_page_header); + index_page_header = std::move(other140.index_page_header); + dictionary_page_header = std::move(other140.dictionary_page_header); + data_page_header_v2 = std::move(other140.data_page_header_v2); + __isset = other140.__isset; return *this; } void 
PageHeader::printTo(std::ostream& out) const { @@ -4901,26 +5000,26 @@ void swap(KeyValue &a, KeyValue &b) { swap(a.__isset, b.__isset); } -KeyValue::KeyValue(const KeyValue& other137) { - key = other137.key; - value = other137.value; - __isset = other137.__isset; +KeyValue::KeyValue(const KeyValue& other141) { + key = other141.key; + value = other141.value; + __isset = other141.__isset; } -KeyValue::KeyValue(KeyValue&& other138) noexcept { - key = std::move(other138.key); - value = std::move(other138.value); - __isset = other138.__isset; +KeyValue::KeyValue(KeyValue&& other142) noexcept { + key = std::move(other142.key); + value = std::move(other142.value); + __isset = other142.__isset; } -KeyValue& KeyValue::operator=(const KeyValue& other139) { - key = other139.key; - value = other139.value; - __isset = other139.__isset; +KeyValue& KeyValue::operator=(const KeyValue& other143) { + key = other143.key; + value = other143.value; + __isset = other143.__isset; return *this; } -KeyValue& KeyValue::operator=(KeyValue&& other140) noexcept { - key = std::move(other140.key); - value = std::move(other140.value); - __isset = other140.__isset; +KeyValue& KeyValue::operator=(KeyValue&& other144) noexcept { + key = std::move(other144.key); + value = std::move(other144.value); + __isset = other144.__isset; return *this; } void KeyValue::printTo(std::ostream& out) const { @@ -5049,26 +5148,26 @@ void swap(SortingColumn &a, SortingColumn &b) { swap(a.nulls_first, b.nulls_first); } -SortingColumn::SortingColumn(const SortingColumn& other141) noexcept { - column_idx = other141.column_idx; - descending = other141.descending; - nulls_first = other141.nulls_first; +SortingColumn::SortingColumn(const SortingColumn& other145) noexcept { + column_idx = other145.column_idx; + descending = other145.descending; + nulls_first = other145.nulls_first; } -SortingColumn::SortingColumn(SortingColumn&& other142) noexcept { - column_idx = other142.column_idx; - descending = other142.descending; - 
nulls_first = other142.nulls_first; +SortingColumn::SortingColumn(SortingColumn&& other146) noexcept { + column_idx = other146.column_idx; + descending = other146.descending; + nulls_first = other146.nulls_first; } -SortingColumn& SortingColumn::operator=(const SortingColumn& other143) noexcept { - column_idx = other143.column_idx; - descending = other143.descending; - nulls_first = other143.nulls_first; +SortingColumn& SortingColumn::operator=(const SortingColumn& other147) noexcept { + column_idx = other147.column_idx; + descending = other147.descending; + nulls_first = other147.nulls_first; return *this; } -SortingColumn& SortingColumn::operator=(SortingColumn&& other144) noexcept { - column_idx = other144.column_idx; - descending = other144.descending; - nulls_first = other144.nulls_first; +SortingColumn& SortingColumn::operator=(SortingColumn&& other148) noexcept { + column_idx = other148.column_idx; + descending = other148.descending; + nulls_first = other148.nulls_first; return *this; } void SortingColumn::printTo(std::ostream& out) const { @@ -5129,9 +5228,9 @@ uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast145; - xfer += iprot->readI32(ecast145); - this->page_type = static_cast(ecast145); + int32_t ecast149; + xfer += iprot->readI32(ecast149); + this->page_type = static_cast(ecast149); isset_page_type = true; } else { xfer += iprot->skip(ftype); @@ -5139,9 +5238,9 @@ uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 2: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast146; - xfer += iprot->readI32(ecast146); - this->encoding = static_cast(ecast146); + int32_t ecast150; + xfer += iprot->readI32(ecast150); + this->encoding = static_cast(ecast150); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -5202,26 +5301,26 @@ void swap(PageEncodingStats &a, PageEncodingStats &b) { 
swap(a.count, b.count); } -PageEncodingStats::PageEncodingStats(const PageEncodingStats& other147) noexcept { - page_type = other147.page_type; - encoding = other147.encoding; - count = other147.count; +PageEncodingStats::PageEncodingStats(const PageEncodingStats& other151) noexcept { + page_type = other151.page_type; + encoding = other151.encoding; + count = other151.count; } -PageEncodingStats::PageEncodingStats(PageEncodingStats&& other148) noexcept { - page_type = other148.page_type; - encoding = other148.encoding; - count = other148.count; +PageEncodingStats::PageEncodingStats(PageEncodingStats&& other152) noexcept { + page_type = other152.page_type; + encoding = other152.encoding; + count = other152.count; } -PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other149) noexcept { - page_type = other149.page_type; - encoding = other149.encoding; - count = other149.count; +PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other153) noexcept { + page_type = other153.page_type; + encoding = other153.encoding; + count = other153.count; return *this; } -PageEncodingStats& PageEncodingStats::operator=(PageEncodingStats&& other150) noexcept { - page_type = other150.page_type; - encoding = other150.encoding; - count = other150.count; +PageEncodingStats& PageEncodingStats::operator=(PageEncodingStats&& other154) noexcept { + page_type = other154.page_type; + encoding = other154.encoding; + count = other154.count; return *this; } void PageEncodingStats::printTo(std::ostream& out) const { @@ -5337,9 +5436,9 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast151; - xfer += iprot->readI32(ecast151); - this->type = static_cast(ecast151); + int32_t ecast155; + xfer += iprot->readI32(ecast155); + this->type = static_cast(ecast155); isset_type = true; } else { xfer += iprot->skip(ftype); @@ -5349,16 +5448,16 @@ uint32_t 
ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->encodings.clear(); - uint32_t _size152; - ::apache::thrift::protocol::TType _etype155; - xfer += iprot->readListBegin(_etype155, _size152); - this->encodings.resize(_size152); - uint32_t _i156; - for (_i156 = 0; _i156 < _size152; ++_i156) + uint32_t _size156; + ::apache::thrift::protocol::TType _etype159; + xfer += iprot->readListBegin(_etype159, _size156); + this->encodings.resize(_size156); + uint32_t _i160; + for (_i160 = 0; _i160 < _size156; ++_i160) { - int32_t ecast157; - xfer += iprot->readI32(ecast157); - this->encodings[_i156] = static_cast(ecast157); + int32_t ecast161; + xfer += iprot->readI32(ecast161); + this->encodings[_i160] = static_cast(ecast161); } xfer += iprot->readListEnd(); } @@ -5371,14 +5470,14 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->path_in_schema.clear(); - uint32_t _size158; - ::apache::thrift::protocol::TType _etype161; - xfer += iprot->readListBegin(_etype161, _size158); - this->path_in_schema.resize(_size158); - uint32_t _i162; - for (_i162 = 0; _i162 < _size158; ++_i162) + uint32_t _size162; + ::apache::thrift::protocol::TType _etype165; + xfer += iprot->readListBegin(_etype165, _size162); + this->path_in_schema.resize(_size162); + uint32_t _i166; + for (_i166 = 0; _i166 < _size162; ++_i166) { - xfer += iprot->readString(this->path_in_schema[_i162]); + xfer += iprot->readString(this->path_in_schema[_i166]); } xfer += iprot->readListEnd(); } @@ -5389,9 +5488,9 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast163; - xfer += iprot->readI32(ecast163); - this->codec = static_cast(ecast163); + int32_t ecast167; + xfer += iprot->readI32(ecast167); + this->codec = static_cast(ecast167); isset_codec = true; } 
else { xfer += iprot->skip(ftype); @@ -5425,14 +5524,14 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->key_value_metadata.clear(); - uint32_t _size164; - ::apache::thrift::protocol::TType _etype167; - xfer += iprot->readListBegin(_etype167, _size164); - this->key_value_metadata.resize(_size164); - uint32_t _i168; - for (_i168 = 0; _i168 < _size164; ++_i168) + uint32_t _size168; + ::apache::thrift::protocol::TType _etype171; + xfer += iprot->readListBegin(_etype171, _size168); + this->key_value_metadata.resize(_size168); + uint32_t _i172; + for (_i172 = 0; _i172 < _size168; ++_i172) { - xfer += this->key_value_metadata[_i168].read(iprot); + xfer += this->key_value_metadata[_i172].read(iprot); } xfer += iprot->readListEnd(); } @@ -5477,14 +5576,14 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->encoding_stats.clear(); - uint32_t _size169; - ::apache::thrift::protocol::TType _etype172; - xfer += iprot->readListBegin(_etype172, _size169); - this->encoding_stats.resize(_size169); - uint32_t _i173; - for (_i173 = 0; _i173 < _size169; ++_i173) + uint32_t _size173; + ::apache::thrift::protocol::TType _etype176; + xfer += iprot->readListBegin(_etype176, _size173); + this->encoding_stats.resize(_size173); + uint32_t _i177; + for (_i177 = 0; _i177 < _size173; ++_i177) { - xfer += this->encoding_stats[_i173].read(iprot); + xfer += this->encoding_stats[_i177].read(iprot); } xfer += iprot->readListEnd(); } @@ -5541,10 +5640,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("encodings", ::apache::thrift::protocol::T_LIST, 2); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I32, static_cast(this->encodings.size())); - std::vector ::const_iterator _iter174; - for (_iter174 = this->encodings.begin(); _iter174 != 
this->encodings.end(); ++_iter174) + std::vector ::const_iterator _iter178; + for (_iter178 = this->encodings.begin(); _iter178 != this->encodings.end(); ++_iter178) { - xfer += oprot->writeI32(static_cast((*_iter174))); + xfer += oprot->writeI32(static_cast((*_iter178))); } xfer += oprot->writeListEnd(); } @@ -5553,10 +5652,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 3); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->path_in_schema.size())); - std::vector ::const_iterator _iter175; - for (_iter175 = this->path_in_schema.begin(); _iter175 != this->path_in_schema.end(); ++_iter175) + std::vector ::const_iterator _iter179; + for (_iter179 = this->path_in_schema.begin(); _iter179 != this->path_in_schema.end(); ++_iter179) { - xfer += oprot->writeString((*_iter175)); + xfer += oprot->writeString((*_iter179)); } xfer += oprot->writeListEnd(); } @@ -5582,10 +5681,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 8); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->key_value_metadata.size())); - std::vector ::const_iterator _iter176; - for (_iter176 = this->key_value_metadata.begin(); _iter176 != this->key_value_metadata.end(); ++_iter176) + std::vector ::const_iterator _iter180; + for (_iter180 = this->key_value_metadata.begin(); _iter180 != this->key_value_metadata.end(); ++_iter180) { - xfer += (*_iter176).write(oprot); + xfer += (*_iter180).write(oprot); } xfer += oprot->writeListEnd(); } @@ -5614,10 +5713,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("encoding_stats", ::apache::thrift::protocol::T_LIST, 13); { xfer += 
oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->encoding_stats.size())); - std::vector ::const_iterator _iter177; - for (_iter177 = this->encoding_stats.begin(); _iter177 != this->encoding_stats.end(); ++_iter177) + std::vector ::const_iterator _iter181; + for (_iter181 = this->encoding_stats.begin(); _iter181 != this->encoding_stats.end(); ++_iter181) { - xfer += (*_iter177).write(oprot); + xfer += (*_iter181).write(oprot); } xfer += oprot->writeListEnd(); } @@ -5652,74 +5751,74 @@ void swap(ColumnMetaData &a, ColumnMetaData &b) { swap(a.__isset, b.__isset); } -ColumnMetaData::ColumnMetaData(const ColumnMetaData& other178) { - type = other178.type; - encodings = other178.encodings; - path_in_schema = other178.path_in_schema; - codec = other178.codec; - num_values = other178.num_values; - total_uncompressed_size = other178.total_uncompressed_size; - total_compressed_size = other178.total_compressed_size; - key_value_metadata = other178.key_value_metadata; - data_page_offset = other178.data_page_offset; - index_page_offset = other178.index_page_offset; - dictionary_page_offset = other178.dictionary_page_offset; - statistics = other178.statistics; - encoding_stats = other178.encoding_stats; - bloom_filter_offset = other178.bloom_filter_offset; - __isset = other178.__isset; -} -ColumnMetaData::ColumnMetaData(ColumnMetaData&& other179) noexcept { - type = other179.type; - encodings = std::move(other179.encodings); - path_in_schema = std::move(other179.path_in_schema); - codec = other179.codec; - num_values = other179.num_values; - total_uncompressed_size = other179.total_uncompressed_size; - total_compressed_size = other179.total_compressed_size; - key_value_metadata = std::move(other179.key_value_metadata); - data_page_offset = other179.data_page_offset; - index_page_offset = other179.index_page_offset; - dictionary_page_offset = other179.dictionary_page_offset; - statistics = std::move(other179.statistics); - encoding_stats = 
std::move(other179.encoding_stats); - bloom_filter_offset = other179.bloom_filter_offset; - __isset = other179.__isset; -} -ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other180) { - type = other180.type; - encodings = other180.encodings; - path_in_schema = other180.path_in_schema; - codec = other180.codec; - num_values = other180.num_values; - total_uncompressed_size = other180.total_uncompressed_size; - total_compressed_size = other180.total_compressed_size; - key_value_metadata = other180.key_value_metadata; - data_page_offset = other180.data_page_offset; - index_page_offset = other180.index_page_offset; - dictionary_page_offset = other180.dictionary_page_offset; - statistics = other180.statistics; - encoding_stats = other180.encoding_stats; - bloom_filter_offset = other180.bloom_filter_offset; - __isset = other180.__isset; +ColumnMetaData::ColumnMetaData(const ColumnMetaData& other182) { + type = other182.type; + encodings = other182.encodings; + path_in_schema = other182.path_in_schema; + codec = other182.codec; + num_values = other182.num_values; + total_uncompressed_size = other182.total_uncompressed_size; + total_compressed_size = other182.total_compressed_size; + key_value_metadata = other182.key_value_metadata; + data_page_offset = other182.data_page_offset; + index_page_offset = other182.index_page_offset; + dictionary_page_offset = other182.dictionary_page_offset; + statistics = other182.statistics; + encoding_stats = other182.encoding_stats; + bloom_filter_offset = other182.bloom_filter_offset; + __isset = other182.__isset; +} +ColumnMetaData::ColumnMetaData(ColumnMetaData&& other183) noexcept { + type = other183.type; + encodings = std::move(other183.encodings); + path_in_schema = std::move(other183.path_in_schema); + codec = other183.codec; + num_values = other183.num_values; + total_uncompressed_size = other183.total_uncompressed_size; + total_compressed_size = other183.total_compressed_size; + key_value_metadata = 
std::move(other183.key_value_metadata); + data_page_offset = other183.data_page_offset; + index_page_offset = other183.index_page_offset; + dictionary_page_offset = other183.dictionary_page_offset; + statistics = std::move(other183.statistics); + encoding_stats = std::move(other183.encoding_stats); + bloom_filter_offset = other183.bloom_filter_offset; + __isset = other183.__isset; +} +ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other184) { + type = other184.type; + encodings = other184.encodings; + path_in_schema = other184.path_in_schema; + codec = other184.codec; + num_values = other184.num_values; + total_uncompressed_size = other184.total_uncompressed_size; + total_compressed_size = other184.total_compressed_size; + key_value_metadata = other184.key_value_metadata; + data_page_offset = other184.data_page_offset; + index_page_offset = other184.index_page_offset; + dictionary_page_offset = other184.dictionary_page_offset; + statistics = other184.statistics; + encoding_stats = other184.encoding_stats; + bloom_filter_offset = other184.bloom_filter_offset; + __isset = other184.__isset; return *this; } -ColumnMetaData& ColumnMetaData::operator=(ColumnMetaData&& other181) noexcept { - type = other181.type; - encodings = std::move(other181.encodings); - path_in_schema = std::move(other181.path_in_schema); - codec = other181.codec; - num_values = other181.num_values; - total_uncompressed_size = other181.total_uncompressed_size; - total_compressed_size = other181.total_compressed_size; - key_value_metadata = std::move(other181.key_value_metadata); - data_page_offset = other181.data_page_offset; - index_page_offset = other181.index_page_offset; - dictionary_page_offset = other181.dictionary_page_offset; - statistics = std::move(other181.statistics); - encoding_stats = std::move(other181.encoding_stats); - bloom_filter_offset = other181.bloom_filter_offset; - __isset = other181.__isset; +ColumnMetaData& ColumnMetaData::operator=(ColumnMetaData&& 
other185) noexcept { + type = other185.type; + encodings = std::move(other185.encodings); + path_in_schema = std::move(other185.path_in_schema); + codec = other185.codec; + num_values = other185.num_values; + total_uncompressed_size = other185.total_uncompressed_size; + total_compressed_size = other185.total_compressed_size; + key_value_metadata = std::move(other185.key_value_metadata); + data_page_offset = other185.data_page_offset; + index_page_offset = other185.index_page_offset; + dictionary_page_offset = other185.dictionary_page_offset; + statistics = std::move(other185.statistics); + encoding_stats = std::move(other185.encoding_stats); + bloom_filter_offset = other185.bloom_filter_offset; + __isset = other185.__isset; return *this; } void ColumnMetaData::printTo(std::ostream& out) const { @@ -5797,18 +5896,18 @@ void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b) { (void) b; } -EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other182) noexcept { - (void) other182; +EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other186) noexcept { + (void) other186; } -EncryptionWithFooterKey::EncryptionWithFooterKey(EncryptionWithFooterKey&& other183) noexcept { - (void) other183; +EncryptionWithFooterKey::EncryptionWithFooterKey(EncryptionWithFooterKey&& other187) noexcept { + (void) other187; } -EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other184) noexcept { - (void) other184; +EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other188) noexcept { + (void) other188; return *this; } -EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(EncryptionWithFooterKey&& other185) noexcept { - (void) other185; +EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(EncryptionWithFooterKey&& other189) noexcept { + (void) other189; return *this; } void EncryptionWithFooterKey::printTo(std::ostream& out) 
const { @@ -5863,14 +5962,14 @@ uint32_t EncryptionWithColumnKey::read(::apache::thrift::protocol::TProtocol* ip if (ftype == ::apache::thrift::protocol::T_LIST) { { this->path_in_schema.clear(); - uint32_t _size186; - ::apache::thrift::protocol::TType _etype189; - xfer += iprot->readListBegin(_etype189, _size186); - this->path_in_schema.resize(_size186); - uint32_t _i190; - for (_i190 = 0; _i190 < _size186; ++_i190) + uint32_t _size190; + ::apache::thrift::protocol::TType _etype193; + xfer += iprot->readListBegin(_etype193, _size190); + this->path_in_schema.resize(_size190); + uint32_t _i194; + for (_i194 = 0; _i194 < _size190; ++_i194) { - xfer += iprot->readString(this->path_in_schema[_i190]); + xfer += iprot->readString(this->path_in_schema[_i194]); } xfer += iprot->readListEnd(); } @@ -5909,10 +6008,10 @@ uint32_t EncryptionWithColumnKey::write(::apache::thrift::protocol::TProtocol* o xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->path_in_schema.size())); - std::vector ::const_iterator _iter191; - for (_iter191 = this->path_in_schema.begin(); _iter191 != this->path_in_schema.end(); ++_iter191) + std::vector ::const_iterator _iter195; + for (_iter195 = this->path_in_schema.begin(); _iter195 != this->path_in_schema.end(); ++_iter195) { - xfer += oprot->writeString((*_iter191)); + xfer += oprot->writeString((*_iter195)); } xfer += oprot->writeListEnd(); } @@ -5935,26 +6034,26 @@ void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b) { swap(a.__isset, b.__isset); } -EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other192) { - path_in_schema = other192.path_in_schema; - key_metadata = other192.key_metadata; - __isset = other192.__isset; +EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other196) { + path_in_schema = other196.path_in_schema; + key_metadata = 
other196.key_metadata; + __isset = other196.__isset; } -EncryptionWithColumnKey::EncryptionWithColumnKey(EncryptionWithColumnKey&& other193) noexcept { - path_in_schema = std::move(other193.path_in_schema); - key_metadata = std::move(other193.key_metadata); - __isset = other193.__isset; +EncryptionWithColumnKey::EncryptionWithColumnKey(EncryptionWithColumnKey&& other197) noexcept { + path_in_schema = std::move(other197.path_in_schema); + key_metadata = std::move(other197.key_metadata); + __isset = other197.__isset; } -EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other194) { - path_in_schema = other194.path_in_schema; - key_metadata = other194.key_metadata; - __isset = other194.__isset; +EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other198) { + path_in_schema = other198.path_in_schema; + key_metadata = other198.key_metadata; + __isset = other198.__isset; return *this; } -EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(EncryptionWithColumnKey&& other195) noexcept { - path_in_schema = std::move(other195.path_in_schema); - key_metadata = std::move(other195.key_metadata); - __isset = other195.__isset; +EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(EncryptionWithColumnKey&& other199) noexcept { + path_in_schema = std::move(other199.path_in_schema); + key_metadata = std::move(other199.key_metadata); + __isset = other199.__isset; return *this; } void EncryptionWithColumnKey::printTo(std::ostream& out) const { @@ -6062,26 +6161,26 @@ void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b) { swap(a.__isset, b.__isset); } -ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other196) { - ENCRYPTION_WITH_FOOTER_KEY = other196.ENCRYPTION_WITH_FOOTER_KEY; - ENCRYPTION_WITH_COLUMN_KEY = other196.ENCRYPTION_WITH_COLUMN_KEY; - __isset = other196.__isset; +ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other200) { + 
ENCRYPTION_WITH_FOOTER_KEY = other200.ENCRYPTION_WITH_FOOTER_KEY; + ENCRYPTION_WITH_COLUMN_KEY = other200.ENCRYPTION_WITH_COLUMN_KEY; + __isset = other200.__isset; } -ColumnCryptoMetaData::ColumnCryptoMetaData(ColumnCryptoMetaData&& other197) noexcept { - ENCRYPTION_WITH_FOOTER_KEY = std::move(other197.ENCRYPTION_WITH_FOOTER_KEY); - ENCRYPTION_WITH_COLUMN_KEY = std::move(other197.ENCRYPTION_WITH_COLUMN_KEY); - __isset = other197.__isset; +ColumnCryptoMetaData::ColumnCryptoMetaData(ColumnCryptoMetaData&& other201) noexcept { + ENCRYPTION_WITH_FOOTER_KEY = std::move(other201.ENCRYPTION_WITH_FOOTER_KEY); + ENCRYPTION_WITH_COLUMN_KEY = std::move(other201.ENCRYPTION_WITH_COLUMN_KEY); + __isset = other201.__isset; } -ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other198) { - ENCRYPTION_WITH_FOOTER_KEY = other198.ENCRYPTION_WITH_FOOTER_KEY; - ENCRYPTION_WITH_COLUMN_KEY = other198.ENCRYPTION_WITH_COLUMN_KEY; - __isset = other198.__isset; +ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other202) { + ENCRYPTION_WITH_FOOTER_KEY = other202.ENCRYPTION_WITH_FOOTER_KEY; + ENCRYPTION_WITH_COLUMN_KEY = other202.ENCRYPTION_WITH_COLUMN_KEY; + __isset = other202.__isset; return *this; } -ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(ColumnCryptoMetaData&& other199) noexcept { - ENCRYPTION_WITH_FOOTER_KEY = std::move(other199.ENCRYPTION_WITH_FOOTER_KEY); - ENCRYPTION_WITH_COLUMN_KEY = std::move(other199.ENCRYPTION_WITH_COLUMN_KEY); - __isset = other199.__isset; +ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(ColumnCryptoMetaData&& other203) noexcept { + ENCRYPTION_WITH_FOOTER_KEY = std::move(other203.ENCRYPTION_WITH_FOOTER_KEY); + ENCRYPTION_WITH_COLUMN_KEY = std::move(other203.ENCRYPTION_WITH_COLUMN_KEY); + __isset = other203.__isset; return *this; } void ColumnCryptoMetaData::printTo(std::ostream& out) const { @@ -6323,54 +6422,54 @@ void swap(ColumnChunk &a, ColumnChunk &b) { swap(a.__isset, 
b.__isset); } -ColumnChunk::ColumnChunk(const ColumnChunk& other200) { - file_path = other200.file_path; - file_offset = other200.file_offset; - meta_data = other200.meta_data; - offset_index_offset = other200.offset_index_offset; - offset_index_length = other200.offset_index_length; - column_index_offset = other200.column_index_offset; - column_index_length = other200.column_index_length; - crypto_metadata = other200.crypto_metadata; - encrypted_column_metadata = other200.encrypted_column_metadata; - __isset = other200.__isset; -} -ColumnChunk::ColumnChunk(ColumnChunk&& other201) noexcept { - file_path = std::move(other201.file_path); - file_offset = other201.file_offset; - meta_data = std::move(other201.meta_data); - offset_index_offset = other201.offset_index_offset; - offset_index_length = other201.offset_index_length; - column_index_offset = other201.column_index_offset; - column_index_length = other201.column_index_length; - crypto_metadata = std::move(other201.crypto_metadata); - encrypted_column_metadata = std::move(other201.encrypted_column_metadata); - __isset = other201.__isset; -} -ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other202) { - file_path = other202.file_path; - file_offset = other202.file_offset; - meta_data = other202.meta_data; - offset_index_offset = other202.offset_index_offset; - offset_index_length = other202.offset_index_length; - column_index_offset = other202.column_index_offset; - column_index_length = other202.column_index_length; - crypto_metadata = other202.crypto_metadata; - encrypted_column_metadata = other202.encrypted_column_metadata; - __isset = other202.__isset; +ColumnChunk::ColumnChunk(const ColumnChunk& other204) { + file_path = other204.file_path; + file_offset = other204.file_offset; + meta_data = other204.meta_data; + offset_index_offset = other204.offset_index_offset; + offset_index_length = other204.offset_index_length; + column_index_offset = other204.column_index_offset; + column_index_length = 
other204.column_index_length; + crypto_metadata = other204.crypto_metadata; + encrypted_column_metadata = other204.encrypted_column_metadata; + __isset = other204.__isset; +} +ColumnChunk::ColumnChunk(ColumnChunk&& other205) noexcept { + file_path = std::move(other205.file_path); + file_offset = other205.file_offset; + meta_data = std::move(other205.meta_data); + offset_index_offset = other205.offset_index_offset; + offset_index_length = other205.offset_index_length; + column_index_offset = other205.column_index_offset; + column_index_length = other205.column_index_length; + crypto_metadata = std::move(other205.crypto_metadata); + encrypted_column_metadata = std::move(other205.encrypted_column_metadata); + __isset = other205.__isset; +} +ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other206) { + file_path = other206.file_path; + file_offset = other206.file_offset; + meta_data = other206.meta_data; + offset_index_offset = other206.offset_index_offset; + offset_index_length = other206.offset_index_length; + column_index_offset = other206.column_index_offset; + column_index_length = other206.column_index_length; + crypto_metadata = other206.crypto_metadata; + encrypted_column_metadata = other206.encrypted_column_metadata; + __isset = other206.__isset; return *this; } -ColumnChunk& ColumnChunk::operator=(ColumnChunk&& other203) noexcept { - file_path = std::move(other203.file_path); - file_offset = other203.file_offset; - meta_data = std::move(other203.meta_data); - offset_index_offset = other203.offset_index_offset; - offset_index_length = other203.offset_index_length; - column_index_offset = other203.column_index_offset; - column_index_length = other203.column_index_length; - crypto_metadata = std::move(other203.crypto_metadata); - encrypted_column_metadata = std::move(other203.encrypted_column_metadata); - __isset = other203.__isset; +ColumnChunk& ColumnChunk::operator=(ColumnChunk&& other207) noexcept { + file_path = std::move(other207.file_path); + 
file_offset = other207.file_offset; + meta_data = std::move(other207.meta_data); + offset_index_offset = other207.offset_index_offset; + offset_index_length = other207.offset_index_length; + column_index_offset = other207.column_index_offset; + column_index_length = other207.column_index_length; + crypto_metadata = std::move(other207.crypto_metadata); + encrypted_column_metadata = std::move(other207.encrypted_column_metadata); + __isset = other207.__isset; return *this; } void ColumnChunk::printTo(std::ostream& out) const { @@ -6459,14 +6558,14 @@ uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->columns.clear(); - uint32_t _size204; - ::apache::thrift::protocol::TType _etype207; - xfer += iprot->readListBegin(_etype207, _size204); - this->columns.resize(_size204); - uint32_t _i208; - for (_i208 = 0; _i208 < _size204; ++_i208) + uint32_t _size208; + ::apache::thrift::protocol::TType _etype211; + xfer += iprot->readListBegin(_etype211, _size208); + this->columns.resize(_size208); + uint32_t _i212; + for (_i212 = 0; _i212 < _size208; ++_i212) { - xfer += this->columns[_i208].read(iprot); + xfer += this->columns[_i212].read(iprot); } xfer += iprot->readListEnd(); } @@ -6495,14 +6594,14 @@ uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->sorting_columns.clear(); - uint32_t _size209; - ::apache::thrift::protocol::TType _etype212; - xfer += iprot->readListBegin(_etype212, _size209); - this->sorting_columns.resize(_size209); - uint32_t _i213; - for (_i213 = 0; _i213 < _size209; ++_i213) + uint32_t _size213; + ::apache::thrift::protocol::TType _etype216; + xfer += iprot->readListBegin(_etype216, _size213); + this->sorting_columns.resize(_size213); + uint32_t _i217; + for (_i217 = 0; _i217 < _size213; ++_i217) { - xfer += this->sorting_columns[_i213].read(iprot); + xfer += this->sorting_columns[_i217].read(iprot); 
} xfer += iprot->readListEnd(); } @@ -6561,10 +6660,10 @@ uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const { xfer += oprot->writeFieldBegin("columns", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->columns.size())); - std::vector ::const_iterator _iter214; - for (_iter214 = this->columns.begin(); _iter214 != this->columns.end(); ++_iter214) + std::vector ::const_iterator _iter218; + for (_iter218 = this->columns.begin(); _iter218 != this->columns.end(); ++_iter218) { - xfer += (*_iter214).write(oprot); + xfer += (*_iter218).write(oprot); } xfer += oprot->writeListEnd(); } @@ -6582,10 +6681,10 @@ uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const { xfer += oprot->writeFieldBegin("sorting_columns", ::apache::thrift::protocol::T_LIST, 4); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->sorting_columns.size())); - std::vector ::const_iterator _iter215; - for (_iter215 = this->sorting_columns.begin(); _iter215 != this->sorting_columns.end(); ++_iter215) + std::vector ::const_iterator _iter219; + for (_iter219 = this->sorting_columns.begin(); _iter219 != this->sorting_columns.end(); ++_iter219) { - xfer += (*_iter215).write(oprot); + xfer += (*_iter219).write(oprot); } xfer += oprot->writeListEnd(); } @@ -6623,46 +6722,46 @@ void swap(RowGroup &a, RowGroup &b) { swap(a.__isset, b.__isset); } -RowGroup::RowGroup(const RowGroup& other216) { - columns = other216.columns; - total_byte_size = other216.total_byte_size; - num_rows = other216.num_rows; - sorting_columns = other216.sorting_columns; - file_offset = other216.file_offset; - total_compressed_size = other216.total_compressed_size; - ordinal = other216.ordinal; - __isset = other216.__isset; -} -RowGroup::RowGroup(RowGroup&& other217) noexcept { - columns = std::move(other217.columns); - total_byte_size = other217.total_byte_size; - num_rows = 
other217.num_rows; - sorting_columns = std::move(other217.sorting_columns); - file_offset = other217.file_offset; - total_compressed_size = other217.total_compressed_size; - ordinal = other217.ordinal; - __isset = other217.__isset; -} -RowGroup& RowGroup::operator=(const RowGroup& other218) { - columns = other218.columns; - total_byte_size = other218.total_byte_size; - num_rows = other218.num_rows; - sorting_columns = other218.sorting_columns; - file_offset = other218.file_offset; - total_compressed_size = other218.total_compressed_size; - ordinal = other218.ordinal; - __isset = other218.__isset; +RowGroup::RowGroup(const RowGroup& other220) { + columns = other220.columns; + total_byte_size = other220.total_byte_size; + num_rows = other220.num_rows; + sorting_columns = other220.sorting_columns; + file_offset = other220.file_offset; + total_compressed_size = other220.total_compressed_size; + ordinal = other220.ordinal; + __isset = other220.__isset; +} +RowGroup::RowGroup(RowGroup&& other221) noexcept { + columns = std::move(other221.columns); + total_byte_size = other221.total_byte_size; + num_rows = other221.num_rows; + sorting_columns = std::move(other221.sorting_columns); + file_offset = other221.file_offset; + total_compressed_size = other221.total_compressed_size; + ordinal = other221.ordinal; + __isset = other221.__isset; +} +RowGroup& RowGroup::operator=(const RowGroup& other222) { + columns = other222.columns; + total_byte_size = other222.total_byte_size; + num_rows = other222.num_rows; + sorting_columns = other222.sorting_columns; + file_offset = other222.file_offset; + total_compressed_size = other222.total_compressed_size; + ordinal = other222.ordinal; + __isset = other222.__isset; return *this; } -RowGroup& RowGroup::operator=(RowGroup&& other219) noexcept { - columns = std::move(other219.columns); - total_byte_size = other219.total_byte_size; - num_rows = other219.num_rows; - sorting_columns = std::move(other219.sorting_columns); - file_offset = 
other219.file_offset; - total_compressed_size = other219.total_compressed_size; - ordinal = other219.ordinal; - __isset = other219.__isset; +RowGroup& RowGroup::operator=(RowGroup&& other223) noexcept { + columns = std::move(other223.columns); + total_byte_size = other223.total_byte_size; + num_rows = other223.num_rows; + sorting_columns = std::move(other223.sorting_columns); + file_offset = other223.file_offset; + total_compressed_size = other223.total_compressed_size; + ordinal = other223.ordinal; + __isset = other223.__isset; return *this; } void RowGroup::printTo(std::ostream& out) const { @@ -6733,18 +6832,18 @@ void swap(TypeDefinedOrder &a, TypeDefinedOrder &b) { (void) b; } -TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other220) noexcept { - (void) other220; +TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other224) noexcept { + (void) other224; } -TypeDefinedOrder::TypeDefinedOrder(TypeDefinedOrder&& other221) noexcept { - (void) other221; +TypeDefinedOrder::TypeDefinedOrder(TypeDefinedOrder&& other225) noexcept { + (void) other225; } -TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other222) noexcept { - (void) other222; +TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other226) noexcept { + (void) other226; return *this; } -TypeDefinedOrder& TypeDefinedOrder::operator=(TypeDefinedOrder&& other223) noexcept { - (void) other223; +TypeDefinedOrder& TypeDefinedOrder::operator=(TypeDefinedOrder&& other227) noexcept { + (void) other227; return *this; } void TypeDefinedOrder::printTo(std::ostream& out) const { @@ -6831,22 +6930,22 @@ void swap(ColumnOrder &a, ColumnOrder &b) { swap(a.__isset, b.__isset); } -ColumnOrder::ColumnOrder(const ColumnOrder& other224) noexcept { - TYPE_ORDER = other224.TYPE_ORDER; - __isset = other224.__isset; +ColumnOrder::ColumnOrder(const ColumnOrder& other228) noexcept { + TYPE_ORDER = other228.TYPE_ORDER; + __isset = other228.__isset; } 
-ColumnOrder::ColumnOrder(ColumnOrder&& other225) noexcept { - TYPE_ORDER = std::move(other225.TYPE_ORDER); - __isset = other225.__isset; +ColumnOrder::ColumnOrder(ColumnOrder&& other229) noexcept { + TYPE_ORDER = std::move(other229.TYPE_ORDER); + __isset = other229.__isset; } -ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other226) noexcept { - TYPE_ORDER = other226.TYPE_ORDER; - __isset = other226.__isset; +ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other230) noexcept { + TYPE_ORDER = other230.TYPE_ORDER; + __isset = other230.__isset; return *this; } -ColumnOrder& ColumnOrder::operator=(ColumnOrder&& other227) noexcept { - TYPE_ORDER = std::move(other227.TYPE_ORDER); - __isset = other227.__isset; +ColumnOrder& ColumnOrder::operator=(ColumnOrder&& other231) noexcept { + TYPE_ORDER = std::move(other231.TYPE_ORDER); + __isset = other231.__isset; return *this; } void ColumnOrder::printTo(std::ostream& out) const { @@ -6974,26 +7073,26 @@ void swap(PageLocation &a, PageLocation &b) { swap(a.first_row_index, b.first_row_index); } -PageLocation::PageLocation(const PageLocation& other228) noexcept { - offset = other228.offset; - compressed_page_size = other228.compressed_page_size; - first_row_index = other228.first_row_index; +PageLocation::PageLocation(const PageLocation& other232) noexcept { + offset = other232.offset; + compressed_page_size = other232.compressed_page_size; + first_row_index = other232.first_row_index; } -PageLocation::PageLocation(PageLocation&& other229) noexcept { - offset = other229.offset; - compressed_page_size = other229.compressed_page_size; - first_row_index = other229.first_row_index; +PageLocation::PageLocation(PageLocation&& other233) noexcept { + offset = other233.offset; + compressed_page_size = other233.compressed_page_size; + first_row_index = other233.first_row_index; } -PageLocation& PageLocation::operator=(const PageLocation& other230) noexcept { - offset = other230.offset; - compressed_page_size = 
other230.compressed_page_size; - first_row_index = other230.first_row_index; +PageLocation& PageLocation::operator=(const PageLocation& other234) noexcept { + offset = other234.offset; + compressed_page_size = other234.compressed_page_size; + first_row_index = other234.first_row_index; return *this; } -PageLocation& PageLocation::operator=(PageLocation&& other231) noexcept { - offset = other231.offset; - compressed_page_size = other231.compressed_page_size; - first_row_index = other231.first_row_index; +PageLocation& PageLocation::operator=(PageLocation&& other235) noexcept { + offset = other235.offset; + compressed_page_size = other235.compressed_page_size; + first_row_index = other235.first_row_index; return *this; } void PageLocation::printTo(std::ostream& out) const { @@ -7046,14 +7145,14 @@ uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->page_locations.clear(); - uint32_t _size232; - ::apache::thrift::protocol::TType _etype235; - xfer += iprot->readListBegin(_etype235, _size232); - this->page_locations.resize(_size232); - uint32_t _i236; - for (_i236 = 0; _i236 < _size232; ++_i236) + uint32_t _size236; + ::apache::thrift::protocol::TType _etype239; + xfer += iprot->readListBegin(_etype239, _size236); + this->page_locations.resize(_size236); + uint32_t _i240; + for (_i240 = 0; _i240 < _size236; ++_i240) { - xfer += this->page_locations[_i236].read(iprot); + xfer += this->page_locations[_i240].read(iprot); } xfer += iprot->readListEnd(); } @@ -7084,10 +7183,10 @@ uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("page_locations", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->page_locations.size())); - std::vector ::const_iterator _iter237; - for (_iter237 = this->page_locations.begin(); _iter237 != this->page_locations.end(); ++_iter237) 
+ std::vector ::const_iterator _iter241; + for (_iter241 = this->page_locations.begin(); _iter241 != this->page_locations.end(); ++_iter241) { - xfer += (*_iter237).write(oprot); + xfer += (*_iter241).write(oprot); } xfer += oprot->writeListEnd(); } @@ -7103,18 +7202,18 @@ void swap(OffsetIndex &a, OffsetIndex &b) { swap(a.page_locations, b.page_locations); } -OffsetIndex::OffsetIndex(const OffsetIndex& other238) { - page_locations = other238.page_locations; +OffsetIndex::OffsetIndex(const OffsetIndex& other242) { + page_locations = other242.page_locations; } -OffsetIndex::OffsetIndex(OffsetIndex&& other239) noexcept { - page_locations = std::move(other239.page_locations); +OffsetIndex::OffsetIndex(OffsetIndex&& other243) noexcept { + page_locations = std::move(other243.page_locations); } -OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other240) { - page_locations = other240.page_locations; +OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other244) { + page_locations = other244.page_locations; return *this; } -OffsetIndex& OffsetIndex::operator=(OffsetIndex&& other241) noexcept { - page_locations = std::move(other241.page_locations); +OffsetIndex& OffsetIndex::operator=(OffsetIndex&& other245) noexcept { + page_locations = std::move(other245.page_locations); return *this; } void OffsetIndex::printTo(std::ostream& out) const { @@ -7185,14 +7284,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->null_pages.clear(); - uint32_t _size242; - ::apache::thrift::protocol::TType _etype245; - xfer += iprot->readListBegin(_etype245, _size242); - this->null_pages.resize(_size242); - uint32_t _i246; - for (_i246 = 0; _i246 < _size242; ++_i246) + uint32_t _size246; + ::apache::thrift::protocol::TType _etype249; + xfer += iprot->readListBegin(_etype249, _size246); + this->null_pages.resize(_size246); + uint32_t _i250; + for (_i250 = 0; _i250 < _size246; ++_i250) { - xfer += 
iprot->readBool(this->null_pages[_i246]); + xfer += iprot->readBool(this->null_pages[_i250]); } xfer += iprot->readListEnd(); } @@ -7205,14 +7304,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->min_values.clear(); - uint32_t _size247; - ::apache::thrift::protocol::TType _etype250; - xfer += iprot->readListBegin(_etype250, _size247); - this->min_values.resize(_size247); - uint32_t _i251; - for (_i251 = 0; _i251 < _size247; ++_i251) + uint32_t _size251; + ::apache::thrift::protocol::TType _etype254; + xfer += iprot->readListBegin(_etype254, _size251); + this->min_values.resize(_size251); + uint32_t _i255; + for (_i255 = 0; _i255 < _size251; ++_i255) { - xfer += iprot->readBinary(this->min_values[_i251]); + xfer += iprot->readBinary(this->min_values[_i255]); } xfer += iprot->readListEnd(); } @@ -7225,14 +7324,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->max_values.clear(); - uint32_t _size252; - ::apache::thrift::protocol::TType _etype255; - xfer += iprot->readListBegin(_etype255, _size252); - this->max_values.resize(_size252); - uint32_t _i256; - for (_i256 = 0; _i256 < _size252; ++_i256) + uint32_t _size256; + ::apache::thrift::protocol::TType _etype259; + xfer += iprot->readListBegin(_etype259, _size256); + this->max_values.resize(_size256); + uint32_t _i260; + for (_i260 = 0; _i260 < _size256; ++_i260) { - xfer += iprot->readBinary(this->max_values[_i256]); + xfer += iprot->readBinary(this->max_values[_i260]); } xfer += iprot->readListEnd(); } @@ -7243,9 +7342,9 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast257; - xfer += iprot->readI32(ecast257); - this->boundary_order = static_cast(ecast257); + int32_t ecast261; + xfer += iprot->readI32(ecast261); + 
this->boundary_order = static_cast(ecast261); isset_boundary_order = true; } else { xfer += iprot->skip(ftype); @@ -7255,14 +7354,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->null_counts.clear(); - uint32_t _size258; - ::apache::thrift::protocol::TType _etype261; - xfer += iprot->readListBegin(_etype261, _size258); - this->null_counts.resize(_size258); - uint32_t _i262; - for (_i262 = 0; _i262 < _size258; ++_i262) + uint32_t _size262; + ::apache::thrift::protocol::TType _etype265; + xfer += iprot->readListBegin(_etype265, _size262); + this->null_counts.resize(_size262); + uint32_t _i266; + for (_i266 = 0; _i266 < _size262; ++_i266) { - xfer += iprot->readI64(this->null_counts[_i262]); + xfer += iprot->readI64(this->null_counts[_i266]); } xfer += iprot->readListEnd(); } @@ -7299,10 +7398,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("null_pages", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_BOOL, static_cast(this->null_pages.size())); - std::vector ::const_iterator _iter263; - for (_iter263 = this->null_pages.begin(); _iter263 != this->null_pages.end(); ++_iter263) + std::vector ::const_iterator _iter267; + for (_iter267 = this->null_pages.begin(); _iter267 != this->null_pages.end(); ++_iter267) { - xfer += oprot->writeBool((*_iter263)); + xfer += oprot->writeBool((*_iter267)); } xfer += oprot->writeListEnd(); } @@ -7311,10 +7410,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("min_values", ::apache::thrift::protocol::T_LIST, 2); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->min_values.size())); - std::vector ::const_iterator _iter264; - for (_iter264 = this->min_values.begin(); _iter264 != this->min_values.end(); ++_iter264) + 
std::vector ::const_iterator _iter268; + for (_iter268 = this->min_values.begin(); _iter268 != this->min_values.end(); ++_iter268) { - xfer += oprot->writeBinary((*_iter264)); + xfer += oprot->writeBinary((*_iter268)); } xfer += oprot->writeListEnd(); } @@ -7323,10 +7422,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("max_values", ::apache::thrift::protocol::T_LIST, 3); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->max_values.size())); - std::vector ::const_iterator _iter265; - for (_iter265 = this->max_values.begin(); _iter265 != this->max_values.end(); ++_iter265) + std::vector ::const_iterator _iter269; + for (_iter269 = this->max_values.begin(); _iter269 != this->max_values.end(); ++_iter269) { - xfer += oprot->writeBinary((*_iter265)); + xfer += oprot->writeBinary((*_iter269)); } xfer += oprot->writeListEnd(); } @@ -7340,10 +7439,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("null_counts", ::apache::thrift::protocol::T_LIST, 5); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->null_counts.size())); - std::vector ::const_iterator _iter266; - for (_iter266 = this->null_counts.begin(); _iter266 != this->null_counts.end(); ++_iter266) + std::vector ::const_iterator _iter270; + for (_iter270 = this->null_counts.begin(); _iter270 != this->null_counts.end(); ++_iter270) { - xfer += oprot->writeI64((*_iter266)); + xfer += oprot->writeI64((*_iter270)); } xfer += oprot->writeListEnd(); } @@ -7364,38 +7463,38 @@ void swap(ColumnIndex &a, ColumnIndex &b) { swap(a.__isset, b.__isset); } -ColumnIndex::ColumnIndex(const ColumnIndex& other267) { - null_pages = other267.null_pages; - min_values = other267.min_values; - max_values = other267.max_values; - boundary_order = other267.boundary_order; - null_counts = other267.null_counts; - __isset = other267.__isset; 
-} -ColumnIndex::ColumnIndex(ColumnIndex&& other268) noexcept { - null_pages = std::move(other268.null_pages); - min_values = std::move(other268.min_values); - max_values = std::move(other268.max_values); - boundary_order = other268.boundary_order; - null_counts = std::move(other268.null_counts); - __isset = other268.__isset; -} -ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other269) { - null_pages = other269.null_pages; - min_values = other269.min_values; - max_values = other269.max_values; - boundary_order = other269.boundary_order; - null_counts = other269.null_counts; - __isset = other269.__isset; +ColumnIndex::ColumnIndex(const ColumnIndex& other271) { + null_pages = other271.null_pages; + min_values = other271.min_values; + max_values = other271.max_values; + boundary_order = other271.boundary_order; + null_counts = other271.null_counts; + __isset = other271.__isset; +} +ColumnIndex::ColumnIndex(ColumnIndex&& other272) noexcept { + null_pages = std::move(other272.null_pages); + min_values = std::move(other272.min_values); + max_values = std::move(other272.max_values); + boundary_order = other272.boundary_order; + null_counts = std::move(other272.null_counts); + __isset = other272.__isset; +} +ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other273) { + null_pages = other273.null_pages; + min_values = other273.min_values; + max_values = other273.max_values; + boundary_order = other273.boundary_order; + null_counts = other273.null_counts; + __isset = other273.__isset; return *this; } -ColumnIndex& ColumnIndex::operator=(ColumnIndex&& other270) noexcept { - null_pages = std::move(other270.null_pages); - min_values = std::move(other270.min_values); - max_values = std::move(other270.max_values); - boundary_order = other270.boundary_order; - null_counts = std::move(other270.null_counts); - __isset = other270.__isset; +ColumnIndex& ColumnIndex::operator=(ColumnIndex&& other274) noexcept { + null_pages = std::move(other274.null_pages); + min_values 
= std::move(other274.min_values); + max_values = std::move(other274.max_values); + boundary_order = other274.boundary_order; + null_counts = std::move(other274.null_counts); + __isset = other274.__isset; return *this; } void ColumnIndex::printTo(std::ostream& out) const { @@ -7525,30 +7624,30 @@ void swap(AesGcmV1 &a, AesGcmV1 &b) { swap(a.__isset, b.__isset); } -AesGcmV1::AesGcmV1(const AesGcmV1& other271) { - aad_prefix = other271.aad_prefix; - aad_file_unique = other271.aad_file_unique; - supply_aad_prefix = other271.supply_aad_prefix; - __isset = other271.__isset; +AesGcmV1::AesGcmV1(const AesGcmV1& other275) { + aad_prefix = other275.aad_prefix; + aad_file_unique = other275.aad_file_unique; + supply_aad_prefix = other275.supply_aad_prefix; + __isset = other275.__isset; } -AesGcmV1::AesGcmV1(AesGcmV1&& other272) noexcept { - aad_prefix = std::move(other272.aad_prefix); - aad_file_unique = std::move(other272.aad_file_unique); - supply_aad_prefix = other272.supply_aad_prefix; - __isset = other272.__isset; +AesGcmV1::AesGcmV1(AesGcmV1&& other276) noexcept { + aad_prefix = std::move(other276.aad_prefix); + aad_file_unique = std::move(other276.aad_file_unique); + supply_aad_prefix = other276.supply_aad_prefix; + __isset = other276.__isset; } -AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other273) { - aad_prefix = other273.aad_prefix; - aad_file_unique = other273.aad_file_unique; - supply_aad_prefix = other273.supply_aad_prefix; - __isset = other273.__isset; +AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other277) { + aad_prefix = other277.aad_prefix; + aad_file_unique = other277.aad_file_unique; + supply_aad_prefix = other277.supply_aad_prefix; + __isset = other277.__isset; return *this; } -AesGcmV1& AesGcmV1::operator=(AesGcmV1&& other274) noexcept { - aad_prefix = std::move(other274.aad_prefix); - aad_file_unique = std::move(other274.aad_file_unique); - supply_aad_prefix = other274.supply_aad_prefix; - __isset = other274.__isset; +AesGcmV1& 
AesGcmV1::operator=(AesGcmV1&& other278) noexcept { + aad_prefix = std::move(other278.aad_prefix); + aad_file_unique = std::move(other278.aad_file_unique); + supply_aad_prefix = other278.supply_aad_prefix; + __isset = other278.__isset; return *this; } void AesGcmV1::printTo(std::ostream& out) const { @@ -7676,30 +7775,30 @@ void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b) { swap(a.__isset, b.__isset); } -AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other275) { - aad_prefix = other275.aad_prefix; - aad_file_unique = other275.aad_file_unique; - supply_aad_prefix = other275.supply_aad_prefix; - __isset = other275.__isset; +AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other279) { + aad_prefix = other279.aad_prefix; + aad_file_unique = other279.aad_file_unique; + supply_aad_prefix = other279.supply_aad_prefix; + __isset = other279.__isset; } -AesGcmCtrV1::AesGcmCtrV1(AesGcmCtrV1&& other276) noexcept { - aad_prefix = std::move(other276.aad_prefix); - aad_file_unique = std::move(other276.aad_file_unique); - supply_aad_prefix = other276.supply_aad_prefix; - __isset = other276.__isset; +AesGcmCtrV1::AesGcmCtrV1(AesGcmCtrV1&& other280) noexcept { + aad_prefix = std::move(other280.aad_prefix); + aad_file_unique = std::move(other280.aad_file_unique); + supply_aad_prefix = other280.supply_aad_prefix; + __isset = other280.__isset; } -AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other277) { - aad_prefix = other277.aad_prefix; - aad_file_unique = other277.aad_file_unique; - supply_aad_prefix = other277.supply_aad_prefix; - __isset = other277.__isset; +AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other281) { + aad_prefix = other281.aad_prefix; + aad_file_unique = other281.aad_file_unique; + supply_aad_prefix = other281.supply_aad_prefix; + __isset = other281.__isset; return *this; } -AesGcmCtrV1& AesGcmCtrV1::operator=(AesGcmCtrV1&& other278) noexcept { - aad_prefix = std::move(other278.aad_prefix); - aad_file_unique = std::move(other278.aad_file_unique); - 
supply_aad_prefix = other278.supply_aad_prefix; - __isset = other278.__isset; +AesGcmCtrV1& AesGcmCtrV1::operator=(AesGcmCtrV1&& other282) noexcept { + aad_prefix = std::move(other282.aad_prefix); + aad_file_unique = std::move(other282.aad_file_unique); + supply_aad_prefix = other282.supply_aad_prefix; + __isset = other282.__isset; return *this; } void AesGcmCtrV1::printTo(std::ostream& out) const { @@ -7808,26 +7907,26 @@ void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b) { swap(a.__isset, b.__isset); } -EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other279) { - AES_GCM_V1 = other279.AES_GCM_V1; - AES_GCM_CTR_V1 = other279.AES_GCM_CTR_V1; - __isset = other279.__isset; +EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other283) { + AES_GCM_V1 = other283.AES_GCM_V1; + AES_GCM_CTR_V1 = other283.AES_GCM_CTR_V1; + __isset = other283.__isset; } -EncryptionAlgorithm::EncryptionAlgorithm(EncryptionAlgorithm&& other280) noexcept { - AES_GCM_V1 = std::move(other280.AES_GCM_V1); - AES_GCM_CTR_V1 = std::move(other280.AES_GCM_CTR_V1); - __isset = other280.__isset; +EncryptionAlgorithm::EncryptionAlgorithm(EncryptionAlgorithm&& other284) noexcept { + AES_GCM_V1 = std::move(other284.AES_GCM_V1); + AES_GCM_CTR_V1 = std::move(other284.AES_GCM_CTR_V1); + __isset = other284.__isset; } -EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other281) { - AES_GCM_V1 = other281.AES_GCM_V1; - AES_GCM_CTR_V1 = other281.AES_GCM_CTR_V1; - __isset = other281.__isset; +EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other285) { + AES_GCM_V1 = other285.AES_GCM_V1; + AES_GCM_CTR_V1 = other285.AES_GCM_CTR_V1; + __isset = other285.__isset; return *this; } -EncryptionAlgorithm& EncryptionAlgorithm::operator=(EncryptionAlgorithm&& other282) noexcept { - AES_GCM_V1 = std::move(other282.AES_GCM_V1); - AES_GCM_CTR_V1 = std::move(other282.AES_GCM_CTR_V1); - __isset = other282.__isset; 
+EncryptionAlgorithm& EncryptionAlgorithm::operator=(EncryptionAlgorithm&& other286) noexcept { + AES_GCM_V1 = std::move(other286.AES_GCM_V1); + AES_GCM_CTR_V1 = std::move(other286.AES_GCM_CTR_V1); + __isset = other286.__isset; return *this; } void EncryptionAlgorithm::printTo(std::ostream& out) const { @@ -7927,14 +8026,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->schema.clear(); - uint32_t _size283; - ::apache::thrift::protocol::TType _etype286; - xfer += iprot->readListBegin(_etype286, _size283); - this->schema.resize(_size283); - uint32_t _i287; - for (_i287 = 0; _i287 < _size283; ++_i287) + uint32_t _size287; + ::apache::thrift::protocol::TType _etype290; + xfer += iprot->readListBegin(_etype290, _size287); + this->schema.resize(_size287); + uint32_t _i291; + for (_i291 = 0; _i291 < _size287; ++_i291) { - xfer += this->schema[_i287].read(iprot); + xfer += this->schema[_i291].read(iprot); } xfer += iprot->readListEnd(); } @@ -7955,14 +8054,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->row_groups.clear(); - uint32_t _size288; - ::apache::thrift::protocol::TType _etype291; - xfer += iprot->readListBegin(_etype291, _size288); - this->row_groups.resize(_size288); - uint32_t _i292; - for (_i292 = 0; _i292 < _size288; ++_i292) + uint32_t _size292; + ::apache::thrift::protocol::TType _etype295; + xfer += iprot->readListBegin(_etype295, _size292); + this->row_groups.resize(_size292); + uint32_t _i296; + for (_i296 = 0; _i296 < _size292; ++_i296) { - xfer += this->row_groups[_i292].read(iprot); + xfer += this->row_groups[_i296].read(iprot); } xfer += iprot->readListEnd(); } @@ -7975,14 +8074,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->key_value_metadata.clear(); - uint32_t _size293; - 
::apache::thrift::protocol::TType _etype296; - xfer += iprot->readListBegin(_etype296, _size293); - this->key_value_metadata.resize(_size293); - uint32_t _i297; - for (_i297 = 0; _i297 < _size293; ++_i297) + uint32_t _size297; + ::apache::thrift::protocol::TType _etype300; + xfer += iprot->readListBegin(_etype300, _size297); + this->key_value_metadata.resize(_size297); + uint32_t _i301; + for (_i301 = 0; _i301 < _size297; ++_i301) { - xfer += this->key_value_metadata[_i297].read(iprot); + xfer += this->key_value_metadata[_i301].read(iprot); } xfer += iprot->readListEnd(); } @@ -8003,14 +8102,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->column_orders.clear(); - uint32_t _size298; - ::apache::thrift::protocol::TType _etype301; - xfer += iprot->readListBegin(_etype301, _size298); - this->column_orders.resize(_size298); - uint32_t _i302; - for (_i302 = 0; _i302 < _size298; ++_i302) + uint32_t _size302; + ::apache::thrift::protocol::TType _etype305; + xfer += iprot->readListBegin(_etype305, _size302); + this->column_orders.resize(_size302); + uint32_t _i306; + for (_i306 = 0; _i306 < _size302; ++_i306) { - xfer += this->column_orders[_i302].read(iprot); + xfer += this->column_orders[_i306].read(iprot); } xfer += iprot->readListEnd(); } @@ -8067,10 +8166,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("schema", ::apache::thrift::protocol::T_LIST, 2); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->schema.size())); - std::vector ::const_iterator _iter303; - for (_iter303 = this->schema.begin(); _iter303 != this->schema.end(); ++_iter303) + std::vector ::const_iterator _iter307; + for (_iter307 = this->schema.begin(); _iter307 != this->schema.end(); ++_iter307) { - xfer += (*_iter303).write(oprot); + xfer += (*_iter307).write(oprot); } xfer += oprot->writeListEnd(); } @@ 
-8083,10 +8182,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("row_groups", ::apache::thrift::protocol::T_LIST, 4); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->row_groups.size())); - std::vector ::const_iterator _iter304; - for (_iter304 = this->row_groups.begin(); _iter304 != this->row_groups.end(); ++_iter304) + std::vector ::const_iterator _iter308; + for (_iter308 = this->row_groups.begin(); _iter308 != this->row_groups.end(); ++_iter308) { - xfer += (*_iter304).write(oprot); + xfer += (*_iter308).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8096,10 +8195,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 5); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->key_value_metadata.size())); - std::vector ::const_iterator _iter305; - for (_iter305 = this->key_value_metadata.begin(); _iter305 != this->key_value_metadata.end(); ++_iter305) + std::vector ::const_iterator _iter309; + for (_iter309 = this->key_value_metadata.begin(); _iter309 != this->key_value_metadata.end(); ++_iter309) { - xfer += (*_iter305).write(oprot); + xfer += (*_iter309).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8114,10 +8213,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("column_orders", ::apache::thrift::protocol::T_LIST, 7); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->column_orders.size())); - std::vector ::const_iterator _iter306; - for (_iter306 = this->column_orders.begin(); _iter306 != this->column_orders.end(); ++_iter306) + std::vector ::const_iterator _iter310; + for (_iter310 = this->column_orders.begin(); _iter310 != this->column_orders.end(); ++_iter310) { - xfer += 
(*_iter306).write(oprot); + xfer += (*_iter310).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8152,54 +8251,54 @@ void swap(FileMetaData &a, FileMetaData &b) { swap(a.__isset, b.__isset); } -FileMetaData::FileMetaData(const FileMetaData& other307) { - version = other307.version; - schema = other307.schema; - num_rows = other307.num_rows; - row_groups = other307.row_groups; - key_value_metadata = other307.key_value_metadata; - created_by = other307.created_by; - column_orders = other307.column_orders; - encryption_algorithm = other307.encryption_algorithm; - footer_signing_key_metadata = other307.footer_signing_key_metadata; - __isset = other307.__isset; -} -FileMetaData::FileMetaData(FileMetaData&& other308) noexcept { - version = other308.version; - schema = std::move(other308.schema); - num_rows = other308.num_rows; - row_groups = std::move(other308.row_groups); - key_value_metadata = std::move(other308.key_value_metadata); - created_by = std::move(other308.created_by); - column_orders = std::move(other308.column_orders); - encryption_algorithm = std::move(other308.encryption_algorithm); - footer_signing_key_metadata = std::move(other308.footer_signing_key_metadata); - __isset = other308.__isset; -} -FileMetaData& FileMetaData::operator=(const FileMetaData& other309) { - version = other309.version; - schema = other309.schema; - num_rows = other309.num_rows; - row_groups = other309.row_groups; - key_value_metadata = other309.key_value_metadata; - created_by = other309.created_by; - column_orders = other309.column_orders; - encryption_algorithm = other309.encryption_algorithm; - footer_signing_key_metadata = other309.footer_signing_key_metadata; - __isset = other309.__isset; +FileMetaData::FileMetaData(const FileMetaData& other311) { + version = other311.version; + schema = other311.schema; + num_rows = other311.num_rows; + row_groups = other311.row_groups; + key_value_metadata = other311.key_value_metadata; + created_by = other311.created_by; + 
column_orders = other311.column_orders; + encryption_algorithm = other311.encryption_algorithm; + footer_signing_key_metadata = other311.footer_signing_key_metadata; + __isset = other311.__isset; +} +FileMetaData::FileMetaData(FileMetaData&& other312) noexcept { + version = other312.version; + schema = std::move(other312.schema); + num_rows = other312.num_rows; + row_groups = std::move(other312.row_groups); + key_value_metadata = std::move(other312.key_value_metadata); + created_by = std::move(other312.created_by); + column_orders = std::move(other312.column_orders); + encryption_algorithm = std::move(other312.encryption_algorithm); + footer_signing_key_metadata = std::move(other312.footer_signing_key_metadata); + __isset = other312.__isset; +} +FileMetaData& FileMetaData::operator=(const FileMetaData& other313) { + version = other313.version; + schema = other313.schema; + num_rows = other313.num_rows; + row_groups = other313.row_groups; + key_value_metadata = other313.key_value_metadata; + created_by = other313.created_by; + column_orders = other313.column_orders; + encryption_algorithm = other313.encryption_algorithm; + footer_signing_key_metadata = other313.footer_signing_key_metadata; + __isset = other313.__isset; return *this; } -FileMetaData& FileMetaData::operator=(FileMetaData&& other310) noexcept { - version = other310.version; - schema = std::move(other310.schema); - num_rows = other310.num_rows; - row_groups = std::move(other310.row_groups); - key_value_metadata = std::move(other310.key_value_metadata); - created_by = std::move(other310.created_by); - column_orders = std::move(other310.column_orders); - encryption_algorithm = std::move(other310.encryption_algorithm); - footer_signing_key_metadata = std::move(other310.footer_signing_key_metadata); - __isset = other310.__isset; +FileMetaData& FileMetaData::operator=(FileMetaData&& other314) noexcept { + version = other314.version; + schema = std::move(other314.schema); + num_rows = other314.num_rows; + 
row_groups = std::move(other314.row_groups); + key_value_metadata = std::move(other314.key_value_metadata); + created_by = std::move(other314.created_by); + column_orders = std::move(other314.column_orders); + encryption_algorithm = std::move(other314.encryption_algorithm); + footer_signing_key_metadata = std::move(other314.footer_signing_key_metadata); + __isset = other314.__isset; return *this; } void FileMetaData::printTo(std::ostream& out) const { @@ -8315,26 +8414,26 @@ void swap(FileCryptoMetaData &a, FileCryptoMetaData &b) { swap(a.__isset, b.__isset); } -FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other311) { - encryption_algorithm = other311.encryption_algorithm; - key_metadata = other311.key_metadata; - __isset = other311.__isset; +FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other315) { + encryption_algorithm = other315.encryption_algorithm; + key_metadata = other315.key_metadata; + __isset = other315.__isset; } -FileCryptoMetaData::FileCryptoMetaData(FileCryptoMetaData&& other312) noexcept { - encryption_algorithm = std::move(other312.encryption_algorithm); - key_metadata = std::move(other312.key_metadata); - __isset = other312.__isset; +FileCryptoMetaData::FileCryptoMetaData(FileCryptoMetaData&& other316) noexcept { + encryption_algorithm = std::move(other316.encryption_algorithm); + key_metadata = std::move(other316.key_metadata); + __isset = other316.__isset; } -FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other313) { - encryption_algorithm = other313.encryption_algorithm; - key_metadata = other313.key_metadata; - __isset = other313.__isset; +FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other317) { + encryption_algorithm = other317.encryption_algorithm; + key_metadata = other317.key_metadata; + __isset = other317.__isset; return *this; } -FileCryptoMetaData& FileCryptoMetaData::operator=(FileCryptoMetaData&& other314) noexcept { - 
encryption_algorithm = std::move(other314.encryption_algorithm); - key_metadata = std::move(other314.key_metadata); - __isset = other314.__isset; +FileCryptoMetaData& FileCryptoMetaData::operator=(FileCryptoMetaData&& other318) noexcept { + encryption_algorithm = std::move(other318.encryption_algorithm); + key_metadata = std::move(other318.key_metadata); + __isset = other318.__isset; return *this; } void FileCryptoMetaData::printTo(std::ostream& out) const { diff --git a/cpp/src/generated/parquet_types.h b/cpp/src/generated/parquet_types.h index 9f468b5051db3..199b4ae747667 100644 --- a/cpp/src/generated/parquet_types.h +++ b/cpp/src/generated/parquet_types.h @@ -359,6 +359,8 @@ class EnumType; class DateType; +class Float16Type; + class NullType; class DecimalType; @@ -770,6 +772,39 @@ void swap(DateType &a, DateType &b); std::ostream& operator<<(std::ostream& out, const DateType& obj); +class Float16Type : public virtual ::apache::thrift::TBase { + public: + + Float16Type(const Float16Type&) noexcept; + Float16Type(Float16Type&&) noexcept; + Float16Type& operator=(const Float16Type&) noexcept; + Float16Type& operator=(Float16Type&&) noexcept; + Float16Type() noexcept { + } + + virtual ~Float16Type() noexcept; + + bool operator == (const Float16Type & /* rhs */) const + { + return true; + } + bool operator != (const Float16Type &rhs) const { + return !(*this == rhs); + } + + bool operator < (const Float16Type & ) const; + + uint32_t read(::apache::thrift::protocol::TProtocol* iprot) override; + uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const override; + + virtual void printTo(std::ostream& out) const; +}; + +void swap(Float16Type &a, Float16Type &b); + +std::ostream& operator<<(std::ostream& out, const Float16Type& obj); + + /** * Logical type to annotate a column that is always null. 
* @@ -1253,7 +1288,7 @@ void swap(BsonType &a, BsonType &b); std::ostream& operator<<(std::ostream& out, const BsonType& obj); typedef struct _LogicalType__isset { - _LogicalType__isset() : STRING(false), MAP(false), LIST(false), ENUM(false), DECIMAL(false), DATE(false), TIME(false), TIMESTAMP(false), INTEGER(false), UNKNOWN(false), JSON(false), BSON(false), UUID(false) {} + _LogicalType__isset() : STRING(false), MAP(false), LIST(false), ENUM(false), DECIMAL(false), DATE(false), TIME(false), TIMESTAMP(false), INTEGER(false), UNKNOWN(false), JSON(false), BSON(false), UUID(false), FLOAT16(false) {} bool STRING :1; bool MAP :1; bool LIST :1; @@ -1267,6 +1302,7 @@ typedef struct _LogicalType__isset { bool JSON :1; bool BSON :1; bool UUID :1; + bool FLOAT16 :1; } _LogicalType__isset; /** @@ -1300,6 +1336,7 @@ class LogicalType : public virtual ::apache::thrift::TBase { JsonType JSON; BsonType BSON; UUIDType UUID; + Float16Type FLOAT16; _LogicalType__isset __isset; @@ -1329,6 +1366,8 @@ class LogicalType : public virtual ::apache::thrift::TBase { void __set_UUID(const UUIDType& val); + void __set_FLOAT16(const Float16Type& val); + bool operator == (const LogicalType & rhs) const { if (__isset.STRING != rhs.__isset.STRING) @@ -1383,6 +1422,10 @@ class LogicalType : public virtual ::apache::thrift::TBase { return false; else if (__isset.UUID && !(UUID == rhs.UUID)) return false; + if (__isset.FLOAT16 != rhs.__isset.FLOAT16) + return false; + else if (__isset.FLOAT16 && !(FLOAT16 == rhs.FLOAT16)) + return false; return true; } bool operator != (const LogicalType &rhs) const { diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 4e23d0fab5c69..fb9e53870583c 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -143,6 +143,8 @@ std::shared_ptr get_logical_type(const DataType& type) { return LogicalType::Date(); case ArrowId::DATE64: return 
LogicalType::Date(); + case ArrowId::HALF_FLOAT: + return LogicalType::Float16(); case ArrowId::TIMESTAMP: { const auto& ts_type = static_cast(type); const bool adjusted_to_utc = !(ts_type.timezone().empty()); @@ -220,6 +222,7 @@ ParquetType::type get_physical_type(const DataType& type) { case ArrowId::FIXED_SIZE_BINARY: case ArrowId::DECIMAL128: case ArrowId::DECIMAL256: + case ArrowId::HALF_FLOAT: return ParquetType::FIXED_LEN_BYTE_ARRAY; case ArrowId::DATE32: return ParquetType::INT32; @@ -525,6 +528,9 @@ static std::shared_ptr MakeSimpleSchema(const DataType& type, byte_width = static_cast(values_type).byte_width(); break; + case ::arrow::Type::HALF_FLOAT: + byte_width = sizeof(::arrow::HalfFloatType::c_type); + break; case ::arrow::Type::DECIMAL128: case ::arrow::Type::DECIMAL256: { const auto& decimal_type = static_cast(values_type); @@ -537,6 +543,9 @@ static std::shared_ptr MakeSimpleSchema(const DataType& type, case ::arrow::Type::FIXED_SIZE_BINARY: byte_width = static_cast(type).byte_width(); break; + case ::arrow::Type::HALF_FLOAT: + byte_width = sizeof(::arrow::HalfFloatType::c_type); + break; case ::arrow::Type::DECIMAL128: case ::arrow::Type::DECIMAL256: { const auto& decimal_type = static_cast(type); @@ -840,12 +849,12 @@ typedef ::testing::Types< ::arrow::BooleanType, ::arrow::UInt8Type, ::arrow::Int8Type, ::arrow::UInt16Type, ::arrow::Int16Type, ::arrow::Int32Type, ::arrow::UInt64Type, ::arrow::Int64Type, ::arrow::Date32Type, ::arrow::FloatType, ::arrow::DoubleType, ::arrow::StringType, - ::arrow::BinaryType, ::arrow::FixedSizeBinaryType, DecimalWithPrecisionAndScale<1>, - DecimalWithPrecisionAndScale<5>, DecimalWithPrecisionAndScale<10>, - DecimalWithPrecisionAndScale<19>, DecimalWithPrecisionAndScale<23>, - DecimalWithPrecisionAndScale<27>, DecimalWithPrecisionAndScale<38>, - Decimal256WithPrecisionAndScale<39>, Decimal256WithPrecisionAndScale<56>, - Decimal256WithPrecisionAndScale<76>> + ::arrow::BinaryType, ::arrow::FixedSizeBinaryType, 
::arrow::HalfFloatType, + DecimalWithPrecisionAndScale<1>, DecimalWithPrecisionAndScale<5>, + DecimalWithPrecisionAndScale<10>, DecimalWithPrecisionAndScale<19>, + DecimalWithPrecisionAndScale<23>, DecimalWithPrecisionAndScale<27>, + DecimalWithPrecisionAndScale<38>, Decimal256WithPrecisionAndScale<39>, + Decimal256WithPrecisionAndScale<56>, Decimal256WithPrecisionAndScale<76>> TestTypes; TYPED_TEST_SUITE(TestParquetIO, TestTypes); @@ -916,9 +925,15 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) { } TYPED_TEST(TestParquetIO, SingleColumnOptionalDictionaryWrite) { - // Skip tests for BOOL as we don't create dictionaries for it. - if (TypeParam::type_id == ::arrow::Type::BOOL) { - return; + switch (TypeParam::type_id) { + case ::arrow::Type::BOOL: + GTEST_SKIP() << "dictionaries not created for BOOL"; + break; + case ::arrow::Type::HALF_FLOAT: + GTEST_SKIP() << "dictionary_encode not supported for HALF_FLOAT"; + break; + default: + break; } std::shared_ptr values; diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index f11101eb24298..5443214f930d7 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -236,6 +236,8 @@ TEST_F(TestConvertParquetSchema, ParquetAnnotatedFields) { ::arrow::fixed_size_binary(12)}, {"uuid", LogicalType::UUID(), ParquetType::FIXED_LEN_BYTE_ARRAY, 16, ::arrow::fixed_size_binary(16)}, + {"float16", LogicalType::Float16(), ParquetType::FIXED_LEN_BYTE_ARRAY, 2, + ::arrow::float16()}, {"none", LogicalType::None(), ParquetType::BOOLEAN, -1, ::arrow::boolean()}, {"none", LogicalType::None(), ParquetType::INT32, -1, ::arrow::int32()}, {"none", LogicalType::None(), ParquetType::INT64, -1, ::arrow::int64()}, @@ -851,6 +853,8 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { ParquetType::FIXED_LEN_BYTE_ARRAY, 7}, {"decimal(32, 8)", ::arrow::decimal(32, 8), LogicalType::Decimal(32, 8), ParquetType::FIXED_LEN_BYTE_ARRAY, 14}, + {"float16", 
::arrow::float16(), LogicalType::Float16(), + ParquetType::FIXED_LEN_BYTE_ARRAY, 2}, {"time32", ::arrow::time32(::arrow::TimeUnit::MILLI), LogicalType::Time(true, LogicalType::TimeUnit::MILLIS), ParquetType::INT32, -1}, {"time64(microsecond)", ::arrow::time64(::arrow::TimeUnit::MICRO), @@ -913,7 +917,8 @@ TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) { }; std::vector cases = { - {"float16", ::arrow::float16()}, + {"run_end_encoded", + ::arrow::run_end_encoded(::arrow::int32(), ::arrow::list(::arrow::int8()))}, }; for (const FieldConstructionArguments& c : cases) { diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 5146aa12c2c36..e5aef5a45b5f3 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -42,6 +42,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/endian.h" +#include "arrow/util/float16.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" @@ -82,6 +83,7 @@ using ::arrow::bit_util::FromBigEndian; using ::arrow::internal::checked_cast; using ::arrow::internal::checked_pointer_cast; using ::arrow::internal::SafeLeftShift; +using ::arrow::util::Float16; using ::arrow::util::SafeLoadAs; using parquet::internal::BinaryRecordReader; @@ -713,6 +715,17 @@ Status TransferDecimal(RecordReader* reader, MemoryPool* pool, return Status::OK(); } +Status TransferHalfFloat(RecordReader* reader, MemoryPool* pool, + const std::shared_ptr& field, Datum* out) { + static const auto binary_type = ::arrow::fixed_size_binary(2); + // Read as a FixedSizeBinaryArray - then, view as a HalfFloatArray + std::shared_ptr chunked_array; + RETURN_NOT_OK( + TransferBinary(reader, pool, field->WithType(binary_type), &chunked_array)); + ARROW_ASSIGN_OR_RAISE(*out, chunked_array->View(field->type())); + return Status::OK(); +} + } // namespace #define TRANSFER_INT32(ENUM, ArrowType) \ 
@@ -772,6 +785,18 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr& va RETURN_NOT_OK(TransferBinary(reader, pool, value_field, &chunked_result)); result = chunked_result; } break; + case ::arrow::Type::HALF_FLOAT: { + const auto& type = *value_field->type(); + if (descr->physical_type() != ::parquet::Type::FIXED_LEN_BYTE_ARRAY) { + return Status::Invalid("Physical type for ", type.ToString(), + " must be fixed length binary"); + } + if (descr->type_length() != type.byte_width()) { + return Status::Invalid("Fixed length binary type for ", type.ToString(), + " must have a byte width of ", type.byte_width()); + } + RETURN_NOT_OK(TransferHalfFloat(reader, pool, value_field, &result)); + } break; case ::arrow::Type::DECIMAL128: { switch (descr->physical_type()) { case ::parquet::Type::INT32: { diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 3323b7ff8b608..f5484f131eb07 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -397,6 +397,11 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, case ArrowTypeId::DURATION: type = ParquetType::INT64; break; + case ArrowTypeId::HALF_FLOAT: + type = ParquetType::FIXED_LEN_BYTE_ARRAY; + logical_type = LogicalType::Float16(); + length = sizeof(uint16_t); + break; case ArrowTypeId::STRUCT: { auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type()); return StructToNode(struct_type, name, field->nullable(), field_id, properties, diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index da0427cb31000..bb75cce084097 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -130,6 +130,8 @@ Result> FromFLBA(const LogicalType& logical_type, switch (logical_type.type()) { case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); + case LogicalType::Type::FLOAT16: + return ::arrow::float16(); case 
LogicalType::Type::NONE: case LogicalType::Type::INTERVAL: case LogicalType::Type::UUID: diff --git a/cpp/src/parquet/arrow/test_util.h b/cpp/src/parquet/arrow/test_util.h index 16c03130c9672..b2be1b3c5354d 100644 --- a/cpp/src/parquet/arrow/test_util.h +++ b/cpp/src/parquet/arrow/test_util.h @@ -33,7 +33,9 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "parquet/column_reader.h" +#include "parquet/test_util.h" namespace parquet { @@ -70,7 +72,14 @@ ::arrow::enable_if_floating_point NonNullArray( size_t size, std::shared_ptr* out) { using c_type = typename ArrowType::c_type; std::vector values; - ::arrow::random_real(size, 0, static_cast(0), static_cast(1), &values); + if constexpr (::arrow::is_half_float_type::value) { + values.resize(size); + test::random_float16_numbers(static_cast(size), 0, ::arrow::util::Float16(0.0f), + ::arrow::util::Float16(1.0f), values.data()); + } else { + ::arrow::random_real(size, 0, static_cast(0), static_cast(1), + &values); + } ::arrow::NumericBuilder builder; RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); return builder.Finish(out); @@ -201,8 +210,14 @@ ::arrow::enable_if_floating_point NullableArray( size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr* out) { using c_type = typename ArrowType::c_type; std::vector values; - ::arrow::random_real(size, seed, static_cast(-1e10), static_cast(1e10), - &values); + if constexpr (::arrow::is_half_float_type::value) { + values.resize(size); + test::random_float16_numbers(static_cast(size), 0, ::arrow::util::Float16(-1e4f), + ::arrow::util::Float16(1e4f), values.data()); + } else { + ::arrow::random_real(size, seed, static_cast(-1e10), + static_cast(1e10), &values); + } std::vector valid_bytes(size, 1); for (size_t i = 0; i < num_nulls; i++) { diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 5dff533c1cce2..a7e7b2f93e174 100644 --- 
a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -39,6 +39,7 @@ #include "arrow/util/compression.h" #include "arrow/util/crc32.h" #include "arrow/util/endian.h" +#include "arrow/util/float16.h" #include "arrow/util/logging.h" #include "arrow/util/rle_encoding.h" #include "arrow/util/type_traits.h" @@ -65,6 +66,7 @@ using arrow::Status; using arrow::bit_util::BitWriter; using arrow::internal::checked_cast; using arrow::internal::checked_pointer_cast; +using arrow::util::Float16; using arrow::util::RleEncoder; namespace bit_util = arrow::bit_util; @@ -2295,6 +2297,33 @@ struct SerializeFunctor< int64_t* scratch; }; +// ---------------------------------------------------------------------- +// Write Arrow to Float16 + +// Requires a custom serializer because Float16s in Parquet are stored as a 2-byte +// (little-endian) FLBA, whereas in Arrow they're a native `uint16_t`. +template <> +struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> { + Status Serialize(const ::arrow::HalfFloatArray& array, ArrowWriteContext*, FLBA* out) { + const uint16_t* values = array.raw_values(); + if (array.null_count() == 0) { + for (int64_t i = 0; i < array.length(); ++i) { + out[i] = ToFLBA(&values[i]); + } + } else { + for (int64_t i = 0; i < array.length(); ++i) { + out[i] = array.IsValid(i) ? 
ToFLBA(&values[i]) : FLBA{}; + } + } + return Status::OK(); + } + + private: + FLBA ToFLBA(const uint16_t* value_ptr) const { + return FLBA{reinterpret_cast(value_ptr)}; + } +}; + template <> Status TypedColumnWriterImpl::WriteArrowDense( const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, @@ -2303,6 +2332,7 @@ Status TypedColumnWriterImpl::WriteArrowDense( WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType) WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType) WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType) + WRITE_SERIALIZE_CASE(HALF_FLOAT, HalfFloatType, FLBAType) default: break; } diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 5bfe38522af7b..4db49b4267415 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -21,6 +21,7 @@ #include #include "arrow/io/file.h" +#include "arrow/util/float16.h" #include "parquet/file_reader.h" #include "parquet/metadata.h" #include "parquet/schema.h" @@ -579,6 +580,27 @@ TEST(PageIndex, WriteFLBAColumnIndex) { /*has_null_counts=*/false); } +TEST(PageIndex, WriteFloat16ColumnIndex) { + using ::arrow::util::Float16; + auto encode = [](auto value) { + auto bytes = Float16(value).ToLittleEndian(); + return std::string(reinterpret_cast(bytes.data()), bytes.size()); + }; + + // Float16 (FLBA) values in the ascending order and without null count. 
+ std::vector page_stats(4); + page_stats.at(0).set_min(encode(-1.3)).set_max(encode(+3.6)); + page_stats.at(1).set_min(encode(-0.2)).set_max(encode(+4.5)); + page_stats.at(2).set_min(encode(+1.1)).set_max(encode(+5.4)); + page_stats.at(3).set_min(encode(+2.0)).set_max(encode(+6.3)); + + auto node = schema::PrimitiveNode::Make( + "c1", Repetition::OPTIONAL, LogicalType::Float16(), Type::FIXED_LEN_BYTE_ARRAY, + /*length=*/2); + TestWriteTypedColumnIndex(std::move(node), page_stats, BoundaryOrder::Ascending, + /*has_null_counts=*/false); +} + TEST(PageIndex, WriteColumnIndexWithAllNullPages) { // All values are null. std::vector page_stats(3); diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index 88e44c96cc24c..d802166be66e8 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -234,6 +234,7 @@ struct MapType {} // see LogicalTypes.md struct ListType {} // see LogicalTypes.md struct EnumType {} // allowed for BINARY, must be encoded with UTF-8 struct DateType {} // allowed for INT32 +struct Float16Type{} // allowed for FIXED[2], must encode raw FLOAT16 bytes /** * Logical type to annotate a column that is always null. 
@@ -344,6 +345,7 @@ union LogicalType { 12: JsonType JSON // use ConvertedType JSON 13: BsonType BSON // use ConvertedType BSON 14: UUIDType UUID // no compatible ConvertedType + 15: Float16Type FLOAT16 // no compatible ConvertedType } /** diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc index 603d9ed8e2124..a1b5557497d9c 100644 --- a/cpp/src/parquet/schema_test.cc +++ b/cpp/src/parquet/schema_test.cc @@ -1147,6 +1147,9 @@ TEST(TestLogicalTypeConstruction, NewTypeIncompatibility) { auto check_is_UUID = [](const std::shared_ptr& logical_type) { return logical_type->is_UUID(); }; + auto check_is_float16 = [](const std::shared_ptr& logical_type) { + return logical_type->is_float16(); + }; auto check_is_null = [](const std::shared_ptr& logical_type) { return logical_type->is_null(); }; @@ -1159,6 +1162,7 @@ TEST(TestLogicalTypeConstruction, NewTypeIncompatibility) { std::vector cases = { {LogicalType::UUID(), check_is_UUID}, + {LogicalType::Float16(), check_is_float16}, {LogicalType::Null(), check_is_null}, {LogicalType::Time(false, LogicalType::TimeUnit::MILLIS), check_is_time}, {LogicalType::Time(false, LogicalType::TimeUnit::MICROS), check_is_time}, @@ -1242,6 +1246,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeProperties) { {JSONLogicalType::Make(), false, true, true}, {BSONLogicalType::Make(), false, true, true}, {UUIDLogicalType::Make(), false, true, true}, + {Float16LogicalType::Make(), false, true, true}, {NoLogicalType::Make(), false, false, true}, }; @@ -1351,7 +1356,8 @@ TEST(TestLogicalTypeOperation, LogicalTypeApplicability) { int physical_length; }; - std::vector inapplicable_types = {{Type::FIXED_LEN_BYTE_ARRAY, 8}, + std::vector inapplicable_types = {{Type::FIXED_LEN_BYTE_ARRAY, 1}, + {Type::FIXED_LEN_BYTE_ARRAY, 8}, {Type::FIXED_LEN_BYTE_ARRAY, 20}, {Type::BOOLEAN, -1}, {Type::INT32, -1}, @@ -1374,6 +1380,12 @@ TEST(TestLogicalTypeOperation, LogicalTypeApplicability) { for (const InapplicableType& t : inapplicable_types) { 
ASSERT_FALSE(logical_type->is_applicable(t.physical_type, t.physical_length)); } + + logical_type = LogicalType::Float16(); + ASSERT_TRUE(logical_type->is_applicable(Type::FIXED_LEN_BYTE_ARRAY, 2)); + for (const InapplicableType& t : inapplicable_types) { + ASSERT_FALSE(logical_type->is_applicable(t.physical_type, t.physical_length)); + } } TEST(TestLogicalTypeOperation, DecimalLogicalTypeApplicability) { @@ -1531,6 +1543,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeRepresentation) { {LogicalType::JSON(), "JSON", R"({"Type": "JSON"})"}, {LogicalType::BSON(), "BSON", R"({"Type": "BSON"})"}, {LogicalType::UUID(), "UUID", R"({"Type": "UUID"})"}, + {LogicalType::Float16(), "Float16", R"({"Type": "Float16"})"}, {LogicalType::None(), "None", R"({"Type": "None"})"}, }; @@ -1580,6 +1593,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeSortOrder) { {LogicalType::JSON(), SortOrder::UNSIGNED}, {LogicalType::BSON(), SortOrder::UNSIGNED}, {LogicalType::UUID(), SortOrder::UNSIGNED}, + {LogicalType::Float16(), SortOrder::SIGNED}, {LogicalType::None(), SortOrder::UNKNOWN}}; for (const ExpectedSortOrder& c : cases) { @@ -1712,6 +1726,15 @@ TEST(TestSchemaNodeCreation, FactoryExceptions) { ASSERT_ANY_THROW(PrimitiveNode::Make("uuid", Repetition::REQUIRED, UUIDLogicalType::Make(), Type::FIXED_LEN_BYTE_ARRAY, 64)); + + // Incompatible primitive type ... + ASSERT_ANY_THROW(PrimitiveNode::Make("float16", Repetition::REQUIRED, + Float16LogicalType::Make(), Type::BYTE_ARRAY, 2)); + // Incompatible primitive length ... + ASSERT_ANY_THROW(PrimitiveNode::Make("float16", Repetition::REQUIRED, + Float16LogicalType::Make(), + Type::FIXED_LEN_BYTE_ARRAY, 3)); + // Non-positive length argument for fixed length binary ... 
ASSERT_ANY_THROW(PrimitiveNode::Make("negative_length", Repetition::REQUIRED, NoLogicalType::Make(), Type::FIXED_LEN_BYTE_ARRAY, @@ -1902,6 +1925,9 @@ TEST_F(TestSchemaElementConstruction, SimpleCases) { [this]() { return element_->logicalType.__isset.BSON; }}, {"uuid", LogicalType::UUID(), Type::FIXED_LEN_BYTE_ARRAY, 16, false, ConvertedType::NA, true, [this]() { return element_->logicalType.__isset.UUID; }}, + {"float16", LogicalType::Float16(), Type::FIXED_LEN_BYTE_ARRAY, 2, false, + ConvertedType::NA, true, + [this]() { return element_->logicalType.__isset.FLOAT16; }}, {"none", LogicalType::None(), Type::INT64, -1, false, ConvertedType::NA, false, check_nothing}}; @@ -2238,6 +2264,7 @@ TEST(TestLogicalTypeSerialization, Roundtrips) { {LogicalType::JSON(), Type::BYTE_ARRAY, -1}, {LogicalType::BSON(), Type::BYTE_ARRAY, -1}, {LogicalType::UUID(), Type::FIXED_LEN_BYTE_ARRAY, 16}, + {LogicalType::Float16(), Type::FIXED_LEN_BYTE_ARRAY, 2}, {LogicalType::None(), Type::BOOLEAN, -1}}; for (const AnnotatedPrimitiveNodeFactoryArguments& c : cases) { diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index ccfb69c487d40..37b245e0dd6c2 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -30,6 +30,7 @@ #include "arrow/type_traits.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" #include "arrow/visit_data_inline.h" @@ -41,6 +42,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; using arrow::internal::checked_cast; +using arrow::util::Float16; using arrow::util::SafeCopy; using arrow::util::SafeLoad; @@ -53,6 +55,23 @@ namespace { constexpr int value_length(int value_length, const ByteArray& value) { return value.len; } constexpr int value_length(int type_length, const FLBA& value) { return type_length; } +// Static "constants" for normalizing float16 min/max values. 
These need to be expressed +// as pointers because `Float16LogicalType` represents an FLBA. +struct Float16Constants { + static constexpr const uint8_t* lowest() { return lowest_.data(); } + static constexpr const uint8_t* max() { return max_.data(); } + static constexpr const uint8_t* positive_zero() { return positive_zero_.data(); } + static constexpr const uint8_t* negative_zero() { return negative_zero_.data(); } + + private: + using Bytes = std::array; + static constexpr Bytes lowest_ = + std::numeric_limits::lowest().ToLittleEndian(); + static constexpr Bytes max_ = std::numeric_limits::max().ToLittleEndian(); + static constexpr Bytes positive_zero_ = (+Float16::FromBits(0)).ToLittleEndian(); + static constexpr Bytes negative_zero_ = (-Float16::FromBits(0)).ToLittleEndian(); +}; + template struct CompareHelper { using T = typename DType::c_type; @@ -277,11 +296,43 @@ template struct CompareHelper : public BinaryLikeCompareHelperBase {}; +template <> +struct CompareHelper { + using T = FLBA; + + static T DefaultMin() { return T{Float16Constants::max()}; } + static T DefaultMax() { return T{Float16Constants::lowest()}; } + + static T Coalesce(T val, T fallback) { + return (val.ptr == nullptr || Float16::FromLittleEndian(val.ptr).is_nan()) ? fallback + : val; + } + + static inline bool Compare(int type_length, const T& a, const T& b) { + const auto lhs = Float16::FromLittleEndian(a.ptr); + const auto rhs = Float16::FromLittleEndian(b.ptr); + // NaN is handled here (same behavior as native float compare) + return lhs < rhs; + } + + static T Min(int type_length, const T& a, const T& b) { + if (a.ptr == nullptr) return b; + if (b.ptr == nullptr) return a; + return Compare(type_length, a, b) ? a : b; + } + + static T Max(int type_length, const T& a, const T& b) { + if (a.ptr == nullptr) return b; + if (b.ptr == nullptr) return a; + return Compare(type_length, a, b) ? 
b : a; + } +}; + using ::std::optional; template ::arrow::enable_if_t::value, optional>> -CleanStatistic(std::pair min_max) { +CleanStatistic(std::pair min_max, LogicalType::Type::type) { return min_max; } @@ -292,7 +343,7 @@ CleanStatistic(std::pair min_max) { // - If max is -0.0f, replace with 0.0f template ::arrow::enable_if_t::value, optional>> -CleanStatistic(std::pair min_max) { +CleanStatistic(std::pair min_max, LogicalType::Type::type) { T min = min_max.first; T max = min_max.second; @@ -318,25 +369,67 @@ CleanStatistic(std::pair min_max) { return {{min, max}}; } -optional> CleanStatistic(std::pair min_max) { +optional> CleanFloat16Statistic(std::pair min_max) { + FLBA min_flba = min_max.first; + FLBA max_flba = min_max.second; + Float16 min = Float16::FromLittleEndian(min_flba.ptr); + Float16 max = Float16::FromLittleEndian(max_flba.ptr); + + if (min.is_nan() || max.is_nan()) { + return ::std::nullopt; + } + + if (min == std::numeric_limits::max() && + max == std::numeric_limits::lowest()) { + return ::std::nullopt; + } + + if (min.is_zero() && !min.signbit()) { + min_flba = FLBA{Float16Constants::negative_zero()}; + } + if (max.is_zero() && max.signbit()) { + max_flba = FLBA{Float16Constants::positive_zero()}; + } + + return {{min_flba, max_flba}}; +} + +optional> CleanStatistic(std::pair min_max, + LogicalType::Type::type logical_type) { if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) { return ::std::nullopt; } + if (logical_type == LogicalType::Type::FLOAT16) { + return CleanFloat16Statistic(std::move(min_max)); + } return min_max; } optional> CleanStatistic( - std::pair min_max) { + std::pair min_max, LogicalType::Type::type) { if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) { return ::std::nullopt; } return min_max; } +template +struct RebindLogical { + using DType = T; + using c_type = typename DType::c_type; +}; + +template <> +struct RebindLogical { + using DType = FLBAType; + using c_type = DType::c_type; +}; 
+ template -class TypedComparatorImpl : virtual public TypedComparator { +class TypedComparatorImpl + : virtual public TypedComparator::DType> { public: - using T = typename DType::c_type; + using T = typename RebindLogical::c_type; using Helper = CompareHelper; explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {} @@ -384,7 +477,9 @@ class TypedComparatorImpl : virtual public TypedComparator { return {min, max}; } - std::pair GetMinMax(const ::arrow::Array& values) override; + std::pair GetMinMax(const ::arrow::Array& values) override { + ParquetException::NYI(values.type()->ToString()); + } private: int type_length_; @@ -412,12 +507,6 @@ TypedComparatorImpl::GetMinMax(const int32_t* va return {SafeCopy(min), SafeCopy(max)}; } -template -std::pair -TypedComparatorImpl::GetMinMax(const ::arrow::Array& values) { - ParquetException::NYI(values.type()->ToString()); -} - template std::pair GetMinMaxBinaryHelper( const TypedComparatorImpl& comparator, @@ -458,6 +547,16 @@ std::pair TypedComparatorImpl::GetMi return GetMinMaxBinaryHelper(*this, values); } +LogicalType::Type::type LogicalTypeId(const ColumnDescriptor* descr) { + if (const auto& logical_type = descr->logical_type()) { + return logical_type->type(); + } + return LogicalType::Type::NONE; +} +LogicalType::Type::type LogicalTypeId(const Statistics& stats) { + return LogicalTypeId(stats.descr()); +} + template class TypedStatisticsImpl : public TypedStatistics { public: @@ -468,9 +567,9 @@ class TypedStatisticsImpl : public TypedStatistics { : descr_(descr), pool_(pool), min_buffer_(AllocateBuffer(pool_, 0)), - max_buffer_(AllocateBuffer(pool_, 0)) { - auto comp = Comparator::Make(descr); - comparator_ = std::static_pointer_cast>(comp); + max_buffer_(AllocateBuffer(pool_, 0)), + logical_type_(LogicalTypeId(descr_)) { + comparator_ = MakeComparator(descr); TypedStatisticsImpl::Reset(); } @@ -527,9 +626,27 @@ class TypedStatisticsImpl : public TypedStatistics { void 
IncrementNumValues(int64_t n) override { num_values_ += n; } + static bool IsMeaningfulLogicalType(LogicalType::Type::type type) { + switch (type) { + case LogicalType::Type::FLOAT16: + return true; + default: + return false; + } + } + bool Equals(const Statistics& raw_other) const override { if (physical_type() != raw_other.physical_type()) return false; + const auto other_logical_type = LogicalTypeId(raw_other); + // Only compare against logical types that influence the interpretation of the + // physical type + if (IsMeaningfulLogicalType(logical_type_)) { + if (logical_type_ != other_logical_type) return false; + } else if (IsMeaningfulLogicalType(other_logical_type)) { + return false; + } + const auto& other = checked_cast(raw_other); if (has_min_max_ != other.has_min_max_) return false; @@ -655,6 +772,7 @@ class TypedStatisticsImpl : public TypedStatistics { EncodedStatistics statistics_; std::shared_ptr> comparator_; std::shared_ptr min_buffer_, max_buffer_; + LogicalType::Type::type logical_type_ = LogicalType::Type::NONE; void PlainEncode(const T& src, std::string* dst) const; void PlainDecode(const std::string& src, T* dst) const; @@ -686,7 +804,7 @@ class TypedStatisticsImpl : public TypedStatistics { void SetMinMaxPair(std::pair min_max) { // CleanStatistic can return a nullopt in case of erroneous values, e.g. 
NaN - auto maybe_min_max = CleanStatistic(min_max); + auto maybe_min_max = CleanStatistic(min_max, logical_type_); if (!maybe_min_max) return; auto min = maybe_min_max.value().first; @@ -795,12 +913,8 @@ void TypedStatisticsImpl::PlainDecode(const std::string& src, dst->ptr = reinterpret_cast(src.c_str()); } -} // namespace - -// ---------------------------------------------------------------------- -// Public factory functions - -std::shared_ptr Comparator::Make(Type::type physical_type, +std::shared_ptr DoMakeComparator(Type::type physical_type, + LogicalType::Type::type logical_type, SortOrder::type sort_order, int type_length) { if (SortOrder::SIGNED == sort_order) { @@ -820,6 +934,10 @@ std::shared_ptr Comparator::Make(Type::type physical_type, case Type::BYTE_ARRAY: return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: + if (logical_type == LogicalType::Type::FLOAT16) { + return std::make_shared>( + type_length); + } return std::make_shared>(type_length); default: ParquetException::NYI("Signed Compare not implemented"); @@ -845,8 +963,21 @@ std::shared_ptr Comparator::Make(Type::type physical_type, return nullptr; } +} // namespace + +// ---------------------------------------------------------------------- +// Public factory functions + +std::shared_ptr Comparator::Make(Type::type physical_type, + SortOrder::type sort_order, + int type_length) { + return DoMakeComparator(physical_type, LogicalType::Type::NONE, sort_order, + type_length); +} + std::shared_ptr Comparator::Make(const ColumnDescriptor* descr) { - return Make(descr->physical_type(), descr->sort_order(), descr->type_length()); + return DoMakeComparator(descr->physical_type(), LogicalTypeId(descr), + descr->sort_order(), descr->type_length()); } std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 637832945ec57..cb2e6455abfa9 100644 --- a/cpp/src/parquet/statistics_test.cc +++ 
b/cpp/src/parquet/statistics_test.cc @@ -34,6 +34,7 @@ #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/float16.h" #include "arrow/util/ubsan.h" #include "parquet/column_reader.h" @@ -49,6 +50,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +using arrow::util::Float16; using arrow::util::SafeCopy; namespace bit_util = arrow::bit_util; @@ -875,9 +877,22 @@ TEST(CorrectStatistics, Basics) { // Test SortOrder class static const int NUM_VALUES = 10; -template +template +struct RebindLogical { + using ParquetType = T; + using CType = typename T::c_type; +}; + +template <> +struct RebindLogical { + using ParquetType = FLBAType; + using CType = ParquetType::c_type; +}; + +template class TestStatisticsSortOrder : public ::testing::Test { public: + using TestType = typename RebindLogical::ParquetType; using c_type = typename TestType::c_type; void SetUp() override { @@ -955,7 +970,7 @@ class TestStatisticsSortOrder : public ::testing::Test { }; using CompareTestTypes = ::testing::Types; + ByteArrayType, FLBAType, Float16LogicalType>; // TYPE::INT32 template <> @@ -1102,6 +1117,39 @@ void TestStatisticsSortOrder::SetValues() { .set_max(std::string(reinterpret_cast(&vals[8][0]), FLBA_LENGTH)); } +template <> +void TestStatisticsSortOrder::AddNodes(std::string name) { + auto node = + schema::PrimitiveNode::Make(name, Repetition::REQUIRED, LogicalType::Float16(), + Type::FIXED_LEN_BYTE_ARRAY, sizeof(uint16_t)); + fields_.push_back(std::move(node)); +} + +template <> +void TestStatisticsSortOrder::SetValues() { + constexpr int kValueLen = 2; + constexpr int kNumBytes = NUM_VALUES * kValueLen; + + const Float16 f16_vals[NUM_VALUES] = { + Float16::FromFloat(+2.0f), Float16::FromFloat(-4.0f), Float16::FromFloat(+4.0f), + Float16::FromFloat(-2.0f), Float16::FromFloat(-1.0f), Float16::FromFloat(+3.0f), + Float16::FromFloat(+1.0f), Float16::FromFloat(-5.0f), Float16::FromFloat(+0.0f), + 
Float16::FromFloat(-3.0f), + }; + + values_buf_.resize(kNumBytes); + uint8_t* ptr = values_buf_.data(); + for (int i = 0; i < NUM_VALUES; ++i) { + f16_vals[i].ToLittleEndian(ptr); + values_[i].ptr = ptr; + ptr += kValueLen; + } + + stats_[0] + .set_min(std::string(reinterpret_cast(values_[7].ptr), kValueLen)) + .set_max(std::string(reinterpret_cast(values_[2].ptr), kValueLen)); +} + TYPED_TEST_SUITE(TestStatisticsSortOrder, CompareTestTypes); TYPED_TEST(TestStatisticsSortOrder, MinMax) { @@ -1167,12 +1215,20 @@ TEST_F(TestStatisticsSortOrderFLBA, UnknownSortOrder) { ASSERT_FALSE(cc_metadata->is_stats_set()); } +template +static std::string EncodeValue(const T& val) { + return std::string(reinterpret_cast(&val), sizeof(val)); +} +static std::string EncodeValue(const FLBA& val, int length = sizeof(uint16_t)) { + return std::string(reinterpret_cast(val.ptr), length); +} + template void AssertMinMaxAre(Stats stats, const Array& values, T expected_min, T expected_max) { stats->Update(values.data(), values.size(), 0); ASSERT_TRUE(stats->HasMinMax()); - EXPECT_EQ(stats->min(), expected_min); - EXPECT_EQ(stats->max(), expected_max); + EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min)); + EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max)); } template @@ -1184,8 +1240,8 @@ void AssertMinMaxAre(Stats stats, const Array& values, const uint8_t* valid_bitm stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count + null_count, non_null_count, null_count); ASSERT_TRUE(stats->HasMinMax()); - EXPECT_EQ(stats->min(), expected_min); - EXPECT_EQ(stats->max(), expected_max); + EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min)); + EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max)); } template @@ -1268,50 +1324,225 @@ void CheckExtrema() { TEST(TestStatistic, Int32Extrema) { CheckExtrema(); } TEST(TestStatistic, Int64Extrema) { CheckExtrema(); } -// PARQUET-1225: Float NaN values may lead to incorrect min-max -template -void CheckNaNs() { - using T = 
typename ParquetType::c_type; +template +class TestFloatStatistics : public ::testing::Test { + public: + using ParquetType = typename RebindLogical::ParquetType; + using c_type = typename ParquetType::c_type; + + void Init(); + void SetUp() override { + this->Init(); + ASSERT_NE(EncodeValue(negative_zero_), EncodeValue(positive_zero_)); + } + + bool signbit(c_type val); + void CheckEq(const c_type& l, const c_type& r); + NodePtr MakeNode(const std::string& name, Repetition::type rep); + + template + void CheckMinMaxZeroesSign(Stats stats, const Values& values) { + stats->Update(values.data(), values.size(), /*null_count=*/0); + ASSERT_TRUE(stats->HasMinMax()); + + this->CheckEq(stats->min(), positive_zero_); + ASSERT_TRUE(this->signbit(stats->min())); + ASSERT_EQ(stats->EncodeMin(), EncodeValue(negative_zero_)); + + this->CheckEq(stats->max(), positive_zero_); + ASSERT_FALSE(this->signbit(stats->max())); + ASSERT_EQ(stats->EncodeMax(), EncodeValue(positive_zero_)); + } + + // ARROW-5562: Ensure that -0.0f and 0.0f values are properly handled like in + // parquet-mr + void TestNegativeZeroes() { + NodePtr node = this->MakeNode("f", Repetition::OPTIONAL); + ColumnDescriptor descr(node, 1, 1); + + { + std::array values{negative_zero_, positive_zero_}; + auto stats = MakeStatistics(&descr); + CheckMinMaxZeroesSign(stats, values); + } + + { + std::array values{positive_zero_, negative_zero_}; + auto stats = MakeStatistics(&descr); + CheckMinMaxZeroesSign(stats, values); + } + + { + std::array values{negative_zero_, negative_zero_}; + auto stats = MakeStatistics(&descr); + CheckMinMaxZeroesSign(stats, values); + } + + { + std::array values{positive_zero_, positive_zero_}; + auto stats = MakeStatistics(&descr); + CheckMinMaxZeroesSign(stats, values); + } + } + + // PARQUET-1225: Float NaN values may lead to incorrect min-max + template + void CheckNaNs(ColumnDescriptor* descr, const Values& all_nans, const Values& some_nans, + const Values& other_nans, c_type min, c_type 
max, uint8_t valid_bitmap, + uint8_t valid_bitmap_no_nans) { + auto some_nan_stats = MakeStatistics(descr); + // Ingesting only nans should not yield valid min max + AssertUnsetMinMax(some_nan_stats, all_nans); + // Ingesting a mix of NaNs and non-NaNs should yield a valid min max. + AssertMinMaxAre(some_nan_stats, some_nans, min, max); + // Ingesting only nans after a valid min/max, should have no effect + AssertMinMaxAre(some_nan_stats, all_nans, min, max); + + some_nan_stats = MakeStatistics(descr); + AssertUnsetMinMax(some_nan_stats, all_nans, &valid_bitmap); + // NaNs should not pollute min max when excluded via null bitmap. + AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap_no_nans, min, max); + // Ingesting NaNs with a null bitmap should not change the result. + AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap, min, max); + + // An array that doesn't start with NaN + auto other_stats = MakeStatistics(descr); + AssertMinMaxAre(other_stats, other_nans, min, max); + } + + void TestNaNs(); + + protected: + std::vector data_buf_; + c_type positive_zero_; + c_type negative_zero_; +}; + +template +void TestFloatStatistics::Init() { + positive_zero_ = c_type{}; + negative_zero_ = -positive_zero_; +} +template <> +void TestFloatStatistics::Init() { + data_buf_.resize(4); + (+Float16(0)).ToLittleEndian(&data_buf_[0]); + positive_zero_ = FLBA{&data_buf_[0]}; + (-Float16(0)).ToLittleEndian(&data_buf_[2]); + negative_zero_ = FLBA{&data_buf_[2]}; +} + +template +NodePtr TestFloatStatistics::MakeNode(const std::string& name, Repetition::type rep) { + return PrimitiveNode::Make(name, rep, ParquetType::type_num); +} +template <> +NodePtr TestFloatStatistics::MakeNode(const std::string& name, + Repetition::type rep) { + return PrimitiveNode::Make(name, rep, LogicalType::Float16(), + Type::FIXED_LEN_BYTE_ARRAY, 2); +} + +template +void TestFloatStatistics::CheckEq(const c_type& l, const c_type& r) { + ASSERT_EQ(l, r); +} +template <> +void 
TestFloatStatistics::CheckEq(const c_type& a, const c_type& b) { + auto l = Float16::FromLittleEndian(a.ptr); + auto r = Float16::FromLittleEndian(b.ptr); + ASSERT_EQ(l, r); +} +template +bool TestFloatStatistics::signbit(c_type val) { + return std::signbit(val); +} +template <> +bool TestFloatStatistics::signbit(c_type val) { + return Float16::FromLittleEndian(val.ptr).signbit(); +} + +template +void TestFloatStatistics::TestNaNs() { constexpr int kNumValues = 8; - NodePtr node = PrimitiveNode::Make("f", Repetition::OPTIONAL, ParquetType::type_num); + NodePtr node = this->MakeNode("f", Repetition::OPTIONAL); ColumnDescriptor descr(node, 1, 1); - constexpr T nan = std::numeric_limits::quiet_NaN(); - constexpr T min = -4.0f; - constexpr T max = 3.0f; + constexpr c_type nan = std::numeric_limits::quiet_NaN(); + constexpr c_type min = -4.0f; + constexpr c_type max = 3.0f; + + std::array all_nans{nan, nan, nan, nan, nan, nan, nan, nan}; + std::array some_nans{nan, max, -3.0f, -1.0f, nan, 2.0f, min, nan}; + std::array other_nans{1.5f, max, -3.0f, -1.0f, nan, 2.0f, min, nan}; - std::array all_nans{nan, nan, nan, nan, nan, nan, nan, nan}; - std::array some_nans{nan, max, -3.0f, -1.0f, nan, 2.0f, min, nan}; uint8_t valid_bitmap = 0x7F; // 0b01111111 // NaNs excluded uint8_t valid_bitmap_no_nans = 0x6E; // 0b01101110 - // Test values - auto some_nan_stats = MakeStatistics(&descr); - // Ingesting only nans should not yield valid min max - AssertUnsetMinMax(some_nan_stats, all_nans); - // Ingesting a mix of NaNs and non-NaNs should not yield valid min max. 
- AssertMinMaxAre(some_nan_stats, some_nans, min, max); - // Ingesting only nans after a valid min/max, should have not effect - AssertMinMaxAre(some_nan_stats, all_nans, min, max); + this->CheckNaNs(&descr, all_nans, some_nans, other_nans, min, max, valid_bitmap, + valid_bitmap_no_nans); +} - some_nan_stats = MakeStatistics(&descr); - AssertUnsetMinMax(some_nan_stats, all_nans, &valid_bitmap); - // NaNs should not pollute min max when excluded via null bitmap. - AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap_no_nans, min, max); - // Ingesting NaNs with a null bitmap should not change the result. - AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap, min, max); +struct BufferedFloat16 { + explicit BufferedFloat16(Float16 f16) : f16(f16) { + this->f16.ToLittleEndian(bytes_.data()); + } + explicit BufferedFloat16(float f) : BufferedFloat16(Float16::FromFloat(f)) {} + const uint8_t* bytes() const { return bytes_.data(); } + + Float16 f16; + std::array bytes_; +}; + +template <> +void TestFloatStatistics::TestNaNs() { + constexpr int kNumValues = 8; + + NodePtr node = this->MakeNode("f", Repetition::OPTIONAL); + ColumnDescriptor descr(node, 1, 1); + + using F16 = BufferedFloat16; + const auto nan_f16 = F16(std::numeric_limits::quiet_NaN()); + const auto min_f16 = F16(-4.0f); + const auto max_f16 = F16(+3.0f); + + const auto min = FLBA{min_f16.bytes()}; + const auto max = FLBA{max_f16.bytes()}; + + std::array all_nans_f16 = {nan_f16, nan_f16, nan_f16, nan_f16, + nan_f16, nan_f16, nan_f16, nan_f16}; + std::array some_nans_f16 = { + nan_f16, max_f16, F16(-3.0f), F16(-1.0f), nan_f16, F16(+2.0f), min_f16, nan_f16}; + std::array other_nans_f16 = some_nans_f16; + other_nans_f16[0] = F16(+1.5f); // +1.5 + + auto prepare_values = [](const auto& values) -> std::vector { + std::vector out(values.size()); + std::transform(values.begin(), values.end(), out.begin(), + [](const F16& f16) { return FLBA{f16.bytes()}; }); + return out; + }; + + auto all_nans = 
prepare_values(all_nans_f16); + auto some_nans = prepare_values(some_nans_f16); + auto other_nans = prepare_values(other_nans_f16); + + uint8_t valid_bitmap = 0x7F; // 0b01111111 + // NaNs excluded + uint8_t valid_bitmap_no_nans = 0x6E; // 0b01101110 - // An array that doesn't start with NaN - std::array other_nans{1.5f, max, -3.0f, -1.0f, nan, 2.0f, min, nan}; - auto other_stats = MakeStatistics(&descr); - AssertMinMaxAre(other_stats, other_nans, min, max); + this->CheckNaNs(&descr, all_nans, some_nans, other_nans, min, max, valid_bitmap, + valid_bitmap_no_nans); } -TEST(TestStatistic, NaNFloatValues) { CheckNaNs(); } +using FloatingPointTypes = ::testing::Types; + +TYPED_TEST_SUITE(TestFloatStatistics, FloatingPointTypes); -TEST(TestStatistic, NaNDoubleValues) { CheckNaNs(); } +TYPED_TEST(TestFloatStatistics, NegativeZeros) { this->TestNegativeZeroes(); } +TYPED_TEST(TestFloatStatistics, NaNs) { this->TestNaNs(); } // ARROW-7376 TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) { @@ -1327,58 +1558,6 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) { AssertUnsetMinMax(stats, nans_but_last, &all_but_last_valid); } -template -void AssertMinMaxZeroesSign(Stats stats, const Array& values) { - stats->Update(values.data(), values.size(), 0); - ASSERT_TRUE(stats->HasMinMax()); - - T zero{}; - ASSERT_EQ(stats->min(), zero); - ASSERT_TRUE(std::signbit(stats->min())); - - ASSERT_EQ(stats->max(), zero); - ASSERT_FALSE(std::signbit(stats->max())); -} - -// ARROW-5562: Ensure that -0.0f and 0.0f values are properly handled like in -// parquet-mr -template -void CheckNegativeZeroStats() { - using T = typename ParquetType::c_type; - - NodePtr node = PrimitiveNode::Make("f", Repetition::OPTIONAL, ParquetType::type_num); - ColumnDescriptor descr(node, 1, 1); - T zero{}; - - { - std::array values{-zero, zero}; - auto stats = MakeStatistics(&descr); - AssertMinMaxZeroesSign(stats, values); - } - - { - std::array values{zero, -zero}; - auto stats = 
MakeStatistics(&descr); - AssertMinMaxZeroesSign(stats, values); - } - - { - std::array values{-zero, -zero}; - auto stats = MakeStatistics(&descr); - AssertMinMaxZeroesSign(stats, values); - } - - { - std::array values{zero, zero}; - auto stats = MakeStatistics(&descr); - AssertMinMaxZeroesSign(stats, values); - } -} - -TEST(TestStatistics, FloatNegativeZero) { CheckNegativeZeroStats(); } - -TEST(TestStatistics, DoubleNegativeZero) { CheckNegativeZeroStats(); } - // Test statistics for binary column with UNSIGNED sort order TEST(TestStatisticsSortOrderMinMax, Unsigned) { std::string dir_string(test::get_data_dir()); diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index b65945cc7329f..a6fa8afc0f5b3 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -101,6 +101,16 @@ void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_v } } +void random_float16_numbers(int n, uint32_t seed, ::arrow::util::Float16 min_value, + ::arrow::util::Float16 max_value, uint16_t* out) { + std::vector values(n); + random_numbers(n, seed, static_cast(min_value), static_cast(max_value), + values.data()); + for (int i = 0; i < n; ++i) { + out[i] = ::arrow::util::Float16(values[i]).bits(); + } +} + void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out) { std::default_random_engine gen(seed); std::uniform_int_distribution d(0, 255); diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index c8578609e9b1d..59728cf53f699 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -33,6 +33,7 @@ #include "arrow/io/memory.h" #include "arrow/testing/util.h" +#include "arrow/util/float16.h" #include "parquet/column_page.h" #include "parquet/column_reader.h" @@ -148,6 +149,9 @@ inline void random_numbers(int n, uint32_t seed, double min_value, double max_va void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value, Int96* out); +void 
random_float16_numbers(int n, uint32_t seed, ::arrow::util::Float16 min_value, + ::arrow::util::Float16 max_value, uint16_t* out); + void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out); void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size, diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 3127b60e5d1ae..04a0fc2e0117b 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -441,6 +441,8 @@ std::shared_ptr LogicalType::FromThrift( return BSONLogicalType::Make(); } else if (type.__isset.UUID) { return UUIDLogicalType::Make(); + } else if (type.__isset.FLOAT16) { + return Float16LogicalType::Make(); } else { throw ParquetException("Metadata contains Thrift LogicalType that is not recognized"); } @@ -494,6 +496,10 @@ std::shared_ptr LogicalType::BSON() { return BSONLogicalType: std::shared_ptr LogicalType::UUID() { return UUIDLogicalType::Make(); } +std::shared_ptr LogicalType::Float16() { + return Float16LogicalType::Make(); +} + std::shared_ptr LogicalType::None() { return NoLogicalType::Make(); } /* @@ -575,6 +581,7 @@ class LogicalType::Impl { class JSON; class BSON; class UUID; + class Float16; class No; class Undefined; @@ -644,6 +651,9 @@ bool LogicalType::is_null() const { return impl_->type() == LogicalType::Type::N bool LogicalType::is_JSON() const { return impl_->type() == LogicalType::Type::JSON; } bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; } bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; } +bool LogicalType::is_float16() const { + return impl_->type() == LogicalType::Type::FLOAT16; +} bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; } bool LogicalType::is_valid() const { return impl_->type() != LogicalType::Type::UNDEFINED; @@ -1557,6 +1567,22 @@ class LogicalType::Impl::UUID final : public LogicalType::Impl::Incompatible, 
GENERATE_MAKE(UUID) +class LogicalType::Impl::Float16 final : public LogicalType::Impl::Incompatible, + public LogicalType::Impl::TypeLengthApplicable { + public: + friend class Float16LogicalType; + + OVERRIDE_TOSTRING(Float16) + OVERRIDE_TOTHRIFT(Float16Type, FLOAT16) + + private: + Float16() + : LogicalType::Impl(LogicalType::Type::FLOAT16, SortOrder::SIGNED), + LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 2) {} +}; + +GENERATE_MAKE(Float16) + class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible, public LogicalType::Impl::UniversalApplicable { public: diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 0315376a883e9..76dd0efc7cb4a 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -157,6 +157,7 @@ class PARQUET_EXPORT LogicalType { JSON, BSON, UUID, + FLOAT16, NONE // Not a real logical type; should always be last element }; }; @@ -210,6 +211,7 @@ class PARQUET_EXPORT LogicalType { static std::shared_ptr JSON(); static std::shared_ptr BSON(); static std::shared_ptr UUID(); + static std::shared_ptr Float16(); /// \brief Create a placeholder for when no logical type is specified static std::shared_ptr None(); @@ -263,6 +265,7 @@ class PARQUET_EXPORT LogicalType { bool is_JSON() const; bool is_BSON() const; bool is_UUID() const; + bool is_float16() const; bool is_none() const; /// \brief Return true if this logical type is of a known type. bool is_valid() const; @@ -433,6 +436,16 @@ class PARQUET_EXPORT UUIDLogicalType : public LogicalType { UUIDLogicalType() = default; }; +/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 2, +/// must encode raw FLOAT16 bytes. +class PARQUET_EXPORT Float16LogicalType : public LogicalType { + public: + static std::shared_ptr Make(); + + private: + Float16LogicalType() = default; +}; + /// \brief Allowed for any physical type. 
class PARQUET_EXPORT NoLogicalType : public LogicalType { public: diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 23fca8fd73010..3e06352f5dde3 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -481,6 +481,8 @@ physical type. +-------------------+-----------------------------+----------------------------+---------+ | MAP | Any | Map | \(6) | +-------------------+-----------------------------+----------------------------+---------+ +| FLOAT16 | FIXED_LENGTH_BYTE_ARRAY | HalfFloat | | ++-------------------+-----------------------------+----------------------------+---------+ * \(1) On the write side, the Parquet physical type INT32 is generated. From d076c69e81e5d331bae214a3cf9fabedb17752fa Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 15 Nov 2023 16:15:42 +0100 Subject: [PATCH 12/23] GH-38676: [Python] Fix potential deadlock when CSV reading errors out (#38713) ### Rationale for this change A deadlock can happen in a C++ destructor in the following case: * the C++ destructor is called from Python, holding the GIL * the C++ destructor waits for a threaded task to finish * the threaded task has invoked some Python code which is waiting to acquire the GIL ### What changes are included in this PR? To reliably prevent such a deadlock, introduce `std::shared_ptr` and `std::unique_ptr` wrappers that release the GIL when deallocating the embedded pointer. ### Are these changes tested? Yes. ### Are there any user-facing changes? No.
* Closes: #38676 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- python/pyarrow/_csv.pyx | 5 +- python/pyarrow/_dataset.pxd | 8 +-- python/pyarrow/_dataset.pyx | 4 +- python/pyarrow/_parquet.pyx | 6 +-- python/pyarrow/includes/libarrow_python.pxd | 8 +++ python/pyarrow/ipc.pxi | 2 +- python/pyarrow/lib.pxd | 4 +- python/pyarrow/src/arrow/python/common.h | 55 +++++++++++++++++++-- python/pyarrow/tests/test_csv.py | 21 ++++++++ 9 files changed, 93 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index e532d8d8ab22a..508488c0c3b3c 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -26,8 +26,7 @@ from collections.abc import Mapping from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * -from pyarrow.includes.libarrow_python cimport (MakeInvalidRowHandler, - PyInvalidRowCallback) +from pyarrow.includes.libarrow_python cimport * from pyarrow.lib cimport (check_status, Field, MemoryPool, Schema, RecordBatchReader, ensure_type, maybe_unbox_memory_pool, get_input_stream, @@ -1251,7 +1250,7 @@ def read_csv(input_file, read_options=None, parse_options=None, CCSVParseOptions c_parse_options CCSVConvertOptions c_convert_options CIOContext io_context - shared_ptr[CCSVReader] reader + SharedPtrNoGIL[CCSVReader] reader shared_ptr[CTable] table _get_reader(input_file, read_options, &stream) diff --git a/python/pyarrow/_dataset.pxd b/python/pyarrow/_dataset.pxd index 210e5558009ec..bee9fc1f0987a 100644 --- a/python/pyarrow/_dataset.pxd +++ b/python/pyarrow/_dataset.pxd @@ -31,7 +31,7 @@ cdef CFileSource _make_file_source(object file, FileSystem filesystem=*) cdef class DatasetFactory(_Weakrefable): cdef: - shared_ptr[CDatasetFactory] wrapped + SharedPtrNoGIL[CDatasetFactory] wrapped CDatasetFactory* factory cdef init(self, const shared_ptr[CDatasetFactory]& sp) @@ -45,7 +45,7 @@ cdef class DatasetFactory(_Weakrefable): cdef class Dataset(_Weakrefable): cdef: - shared_ptr[CDataset] 
wrapped + SharedPtrNoGIL[CDataset] wrapped CDataset* dataset public dict _scan_options @@ -59,7 +59,7 @@ cdef class Dataset(_Weakrefable): cdef class Scanner(_Weakrefable): cdef: - shared_ptr[CScanner] wrapped + SharedPtrNoGIL[CScanner] wrapped CScanner* scanner cdef void init(self, const shared_ptr[CScanner]& sp) @@ -122,7 +122,7 @@ cdef class FileWriteOptions(_Weakrefable): cdef class Fragment(_Weakrefable): cdef: - shared_ptr[CFragment] wrapped + SharedPtrNoGIL[CFragment] wrapped CFragment* fragment cdef void init(self, const shared_ptr[CFragment]& sp) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 48ee676915311..d7d69965d000a 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -3227,7 +3227,7 @@ cdef class RecordBatchIterator(_Weakrefable): object iterator_owner # Iterator is a non-POD type and Cython uses offsetof, leading # to a compiler warning unless wrapped like so - shared_ptr[CRecordBatchIterator] iterator + SharedPtrNoGIL[CRecordBatchIterator] iterator def __init__(self): _forbid_instantiation(self.__class__, subclasses_instead=False) @@ -3273,7 +3273,7 @@ cdef class TaggedRecordBatchIterator(_Weakrefable): """An iterator over a sequence of record batches with fragments.""" cdef: object iterator_owner - shared_ptr[CTaggedRecordBatchIterator] iterator + SharedPtrNoGIL[CTaggedRecordBatchIterator] iterator def __init__(self): _forbid_instantiation(self.__class__, subclasses_instead=False) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 48091367b2ff8..089ed7c75ce58 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -24,6 +24,7 @@ import warnings from cython.operator cimport dereference as deref from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_python cimport * from pyarrow.lib cimport (_Weakrefable, Buffer, Schema, check_status, MemoryPool, maybe_unbox_memory_pool, @@ -1165,7 +1166,7 @@ 
cdef class ParquetReader(_Weakrefable): cdef: object source CMemoryPool* pool - unique_ptr[FileReader] reader + UniquePtrNoGIL[FileReader] reader FileMetaData _metadata shared_ptr[CRandomAccessFile] rd_handle @@ -1334,7 +1335,7 @@ cdef class ParquetReader(_Weakrefable): vector[int] c_row_groups vector[int] c_column_indices shared_ptr[CRecordBatch] record_batch - unique_ptr[CRecordBatchReader] recordbatchreader + UniquePtrNoGIL[CRecordBatchReader] recordbatchreader self.set_batch_size(batch_size) @@ -1366,7 +1367,6 @@ cdef class ParquetReader(_Weakrefable): check_status( recordbatchreader.get().ReadNext(&record_batch) ) - if record_batch.get() == NULL: break diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index 4d109fc660e08..b8a3041796f97 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -261,6 +261,14 @@ cdef extern from "arrow/python/common.h" namespace "arrow::py": void RestorePyError(const CStatus& status) except * +cdef extern from "arrow/python/common.h" namespace "arrow::py" nogil: + cdef cppclass SharedPtrNoGIL[T](shared_ptr[T]): + # This looks like the only way to satsify both Cython 2 and Cython 3 + SharedPtrNoGIL& operator=(...) + cdef cppclass UniquePtrNoGIL[T, DELETER=*](unique_ptr[T, DELETER]): + UniquePtrNoGIL& operator=(...) 
+ + cdef extern from "arrow/python/inference.h" namespace "arrow::py": c_bool IsPyBool(object o) c_bool IsPyInt(object o) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index fcb9eb729ef04..5d20a4f8b72cb 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -977,7 +977,7 @@ cdef _wrap_record_batch_with_metadata(CRecordBatchWithMetadata c): cdef class _RecordBatchFileReader(_Weakrefable): cdef: - shared_ptr[CRecordBatchFileReader] reader + SharedPtrNoGIL[CRecordBatchFileReader] reader shared_ptr[CRandomAccessFile] file CIpcReadOptions options diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 63ebe6aea8233..ae197eca1ca6b 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -552,12 +552,12 @@ cdef class CompressedOutputStream(NativeFile): cdef class _CRecordBatchWriter(_Weakrefable): cdef: - shared_ptr[CRecordBatchWriter] writer + SharedPtrNoGIL[CRecordBatchWriter] writer cdef class RecordBatchReader(_Weakrefable): cdef: - shared_ptr[CRecordBatchReader] reader + SharedPtrNoGIL[CRecordBatchReader] reader cdef class Codec(_Weakrefable): diff --git a/python/pyarrow/src/arrow/python/common.h b/python/pyarrow/src/arrow/python/common.h index bc567ef78e83a..4a7886695eadb 100644 --- a/python/pyarrow/src/arrow/python/common.h +++ b/python/pyarrow/src/arrow/python/common.h @@ -19,6 +19,7 @@ #include #include +#include #include #include "arrow/buffer.h" @@ -134,13 +135,15 @@ class ARROW_PYTHON_EXPORT PyAcquireGIL { // A RAII-style helper that releases the GIL until the end of a lexical block class ARROW_PYTHON_EXPORT PyReleaseGIL { public: - PyReleaseGIL() { saved_state_ = PyEval_SaveThread(); } - - ~PyReleaseGIL() { PyEval_RestoreThread(saved_state_); } + PyReleaseGIL() : ptr_(PyEval_SaveThread(), &unique_ptr_deleter) {} private: - PyThreadState* saved_state_; - ARROW_DISALLOW_COPY_AND_ASSIGN(PyReleaseGIL); + static void unique_ptr_deleter(PyThreadState* state) { + if (state) { + PyEval_RestoreThread(state); + } + } + 
std::unique_ptr ptr_; }; // A helper to call safely into the Python interpreter from arbitrary C++ code. @@ -238,6 +241,48 @@ class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef { } }; +template