From 7d96d9434028fb4d5fcf677d93dd584284cc713c Mon Sep 17 00:00:00 2001 From: ViggoC Date: Fri, 20 Sep 2024 13:36:22 +0800 Subject: [PATCH] GH-39982: [Java] Add RunEndEncodedVector (#43888) ### Rationale for this change ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? * GitHub Issue: #39982 Lead-authored-by: ViggoC Co-authored-by: chenweiguo.vc Co-authored-by: David Li Signed-off-by: David Li --- .../binder/ColumnBinderArrowTypeVisitor.java | 5 + .../arrow/c/BufferImportTypeVisitor.java | 5 + .../jdbc/utils/AvaticaParameterBinder.java | 6 + .../arrow/driver/jdbc/utils/ConvertUtils.java | 6 + .../src/main/codegen/data/ArrowTypes.tdd | 5 + .../main/codegen/templates/UnionReader.java | 2 +- .../org/apache/arrow/vector/TypeLayout.java | 11 + .../vector/compare/RangeEqualsVisitor.java | 53 ++ .../vector/compare/TypeEqualsVisitor.java | 6 + .../arrow/vector/compare/VectorVisitor.java | 6 + .../vector/complex/RunEndEncodedVector.java | 684 ++++++++++++++++++ .../arrow/vector/extension/OpaqueType.java | 5 + .../org/apache/arrow/vector/types/Types.java | 20 + .../validate/ValidateVectorBufferVisitor.java | 34 + .../validate/ValidateVectorTypeVisitor.java | 12 + .../arrow/vector/TestRunEndEncodedVector.java | 231 ++++++ .../vector/validate/TestValidateVector.java | 38 + 17 files changed, 1128 insertions(+), 1 deletion(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java index cb8e43035d33b..a3d615a7e1958 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java @@ -96,6 +96,11 @@ public ColumnBinder visit(ArrowType.Union type) { throw new UnsupportedOperationException("No column binder implemented for type " + type); } + @Override + public ColumnBinder visit(ArrowType.RunEndEncoded type) { + throw new UnsupportedOperationException("No column binder implemented for type " + type); + } + @Override public ColumnBinder visit(ArrowType.Map type) { return new MapBinder((MapVector) vector); diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index e47d27bf091ee..150c11e41edff 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -185,6 +185,11 @@ public List visit(ArrowType.Union type) { } } + @Override + public List visit(ArrowType.RunEndEncoded type) { + throw new UnsupportedOperationException("Importing buffers for type: " + type); + } + @Override public List visit(ArrowType.Map type) { return Arrays.asList(maybeImportBitmap(type), importOffsets(type, MapVector.OFFSET_WIDTH)); diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java index 232fa1524088b..4c2a9b865f141 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java @@ -281,5 +281,11 @@ public Boolean visit(ArrowType.ListView type) { public Boolean visit(ArrowType.LargeListView type) { throw new UnsupportedOperationException("Binding is not yet supported for type " + type); } + + @Override + public Boolean visit(ArrowType.RunEndEncoded type) { + throw new UnsupportedOperationException( + "No Avatica parameter binder implemented for type " + type); + } } } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java index ea57aeb774c0a..17b0f42dc7111 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java @@ -284,5 +284,11 @@ public AvaticaParameter visit(ArrowType.LargeListView type) { throw new UnsupportedOperationException( "AvaticaParameter not yet supported for type " + type); } + + @Override + public AvaticaParameter visit(ArrowType.RunEndEncoded type) { + throw new UnsupportedOperationException( + "No Avatica parameter binder implemented for type " + type); + } } } diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index d0e8ef1e358ea..5a0b30e47ee52 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -139,6 +139,11 @@ name: "LargeListView", fields: [], complex: true + }, + { + name: "RunEndEncoded", + fields: [], + complex: true } ] } diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index d2b2f4bb70975..68e30ef48846b 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -39,7 +39,7 @@ @SuppressWarnings("unused") public class UnionReader extends AbstractFieldReader { - private static final int NUM_SUPPORTED_TYPES = 50; + private static final int NUM_SUPPORTED_TYPES = 51; private BaseReader[] readers = new BaseReader[NUM_SUPPORTED_TYPES]; public UnionVector data; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index 78a3cac020a8c..fa75ef04577a3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -40,6 +40,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.LargeUtf8; import org.apache.arrow.vector.types.pojo.ArrowType.Map; import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.ArrowType.RunEndEncoded; import org.apache.arrow.vector.types.pojo.ArrowType.Struct; import org.apache.arrow.vector.types.pojo.ArrowType.Time; import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; @@ -280,6 +281,11 @@ public TypeLayout visit(Interval type) { public TypeLayout visit(Duration type) { return newFixedWidthTypeLayout(BufferLayout.dataBuffer(64)); } + + @Override + public TypeLayout visit(RunEndEncoded type) { + return new TypeLayout(Collections.emptyList()); + } }); return layout; } @@ -444,6 +450,11 @@ public Integer visit(Interval type) { public Integer visit(Duration type) { return FIXED_WIDTH_BUFFER_COUNT; } + + @Override + public Integer visit(RunEndEncoded type) { + return 0; + } }); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index ed51f748af577..abcf312c5ecfc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.ExtensionTypeVector; +import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.BaseLargeRepeatedValueViewVector; @@ -41,11 +42,13 @@ import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; /** Visitor to compare a range of values for vectors. */ public class RangeEqualsVisitor implements VectorVisitor { + private ValueVector left; private ValueVector right; @@ -226,6 +229,14 @@ public Boolean visit(NullVector left, Range range) { return true; } + @Override + public Boolean visit(RunEndEncodedVector left, Range range) { + if (!validate(left)) { + return false; + } + return compareRunEndEncodedVectors(range); + } + @Override public Boolean visit(ExtensionTypeVector left, Range range) { if (!(right instanceof ExtensionTypeVector) || !validate(left)) { @@ -255,6 +266,48 @@ public Boolean visit(LargeListViewVector left, Range range) { return compareLargeListViewVectors(range); } + protected boolean compareRunEndEncodedVectors(Range range) { + RunEndEncodedVector leftVector = (RunEndEncodedVector) left; + RunEndEncodedVector rightVector = (RunEndEncodedVector) right; + + final int leftRangeEnd = range.getLeftStart() + range.getLength(); + final int rightRangeEnd = range.getRightStart() + range.getLength(); + + FieldVector leftValuesVector = leftVector.getValuesVector(); + FieldVector rightValuesVector = rightVector.getValuesVector(); + + RangeEqualsVisitor innerVisitor = createInnerVisitor(leftValuesVector, rightValuesVector, null); + + int leftLogicalIndex = range.getLeftStart(); + int rightLogicalIndex = range.getRightStart(); + + while (leftLogicalIndex < leftRangeEnd) { + // TODO: implement it more efficient + // https://github.com/apache/arrow/issues/44157 + int leftPhysicalIndex = leftVector.getPhysicalIndex(leftLogicalIndex); + int rightPhysicalIndex = rightVector.getPhysicalIndex(rightLogicalIndex); + if (leftValuesVector.accept( + innerVisitor, new Range(leftPhysicalIndex, rightPhysicalIndex, 1))) { + int leftRunEnd = leftVector.getRunEnd(leftLogicalIndex); + int rightRunEnd = rightVector.getRunEnd(rightLogicalIndex); + + int leftRunLength = Math.min(leftRunEnd, leftRangeEnd) - leftLogicalIndex; + int rightRunLength = Math.min(rightRunEnd, rightRangeEnd) - rightLogicalIndex; + + if (leftRunLength != rightRunLength) { + return false; + } else { + leftLogicalIndex = leftRunEnd; + rightLogicalIndex = rightRunEnd; + } + } else { + return false; + } + } + + return true; + } + protected RangeEqualsVisitor createInnerVisitor( ValueVector leftInner, ValueVector rightInner, diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java index ce92b22ef61c9..30b2f511a0445 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java @@ -32,6 +32,7 @@ import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.types.pojo.Field; @@ -136,6 +137,11 @@ public Boolean visit(LargeListViewVector left, Void value) { return compareField(left.getField(), right.getField()); } + @Override + public Boolean visit(RunEndEncodedVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + private boolean compareField(Field leftField, Field rightField) { if (leftField == rightField) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java index e20f8cd9cfba5..989c57a0c93d0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; import org.apache.arrow.vector.complex.UnionVector; /** @@ -71,4 +72,9 @@ default OUT visit(LargeListViewVector left, IN value) { throw new UnsupportedOperationException( "VectorVisitor for LargeListViewVector is not supported."); } + + default OUT visit(RunEndEncodedVector left, IN value) { + throw new UnsupportedOperationException( + "VectorVisitor for LargeListViewVector is not supported."); + }; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java new file mode 100644 index 0000000000000..e8de86f6e9549 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java @@ -0,0 +1,684 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex; + +import static org.apache.arrow.util.Preconditions.checkArgument; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.vector.BaseIntVector; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A run-end encoded vector contains only two child vectors: a run_end vector of type int and a + * values vector of any type. There are no buffers associated with the parent vector. + */ +public class RunEndEncodedVector extends BaseValueVector implements FieldVector { + public static final FieldVector DEFAULT_VALUE_VECTOR = ZeroVector.INSTANCE; + public static final FieldVector DEFAULT_RUN_END_VECTOR = ZeroVector.INSTANCE; + + public static RunEndEncodedVector empty(String name, BufferAllocator allocator) { + return new RunEndEncodedVector( + name, allocator, FieldType.notNullable(ArrowType.RunEndEncoded.INSTANCE), null); + } + + protected final CallBack callBack; + protected Field field; + protected FieldVector runEndsVector; + protected FieldVector valuesVector; + protected int valueCount; + + /** + * Constructs a new instance. + * + * @param name The name of the instance. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param fieldType The type of the array that is run-end encoded. + * @param callBack A schema change callback. + */ + public RunEndEncodedVector( + String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { + this(new Field(name, fieldType, null), allocator, callBack); + } + + /** + * Constructs a new instance. + * + * @param field The field materialized by this vector. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param callBack A schema change callback. + */ + public RunEndEncodedVector(Field field, BufferAllocator allocator, CallBack callBack) { + this(field, allocator, DEFAULT_RUN_END_VECTOR, DEFAULT_VALUE_VECTOR, callBack); + } + + /** + * Constructs a new instance. + * + * @param field The field materialized by this vector. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param runEndsVector The vector represents run ends. Only Zero vector or type int vector with + * size 16, 32 is allowed + * @param valuesVector The vector represents values + * @param callBack A schema change callback. + */ + public RunEndEncodedVector( + Field field, + BufferAllocator allocator, + FieldVector runEndsVector, + FieldVector valuesVector, + CallBack callBack) { + super(allocator); + this.field = field; + this.callBack = callBack; + this.valueCount = 0; + this.runEndsVector = runEndsVector; + this.valuesVector = valuesVector; + } + + /** ValueVector interface */ + + /** + * Allocate new buffers. ValueVector implements logic to determine how much to allocate. + * + * @throws OutOfMemoryException Thrown if no memory can be allocated. + */ + @Override + public void allocateNew() throws OutOfMemoryException { + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory"); + } + } + + /** + * Allocates new buffers. ValueVector implements logic to determine how much to allocate. + * + * @return Returns true if allocation was successful. + */ + @Override + public boolean allocateNewSafe() { + initializeChildrenFromFields(field.getChildren()); + for (FieldVector v : getChildrenFromFields()) { + boolean isAllocated = v.allocateNewSafe(); + if (!isAllocated) { + v.clear(); + return false; + } + } + return true; + } + + /** + * Allocate new buffer with double capacity, and copy data into the new buffer. Replace vector's + * buffer with new buffer, and release old one + */ + @Override + public void reAlloc() { + for (FieldVector v : getChildrenFromFields()) { + v.reAlloc(); + } + } + + @Override + public BufferAllocator getAllocator() { + return allocator; + } + + @Override + protected FieldReader getReaderImpl() { + throw new UnsupportedOperationException("Not yet implemented."); + } + + /** + * Set the initial record capacity. + * + * @param numRecords the initial record capacity. + */ + @Override + public void setInitialCapacity(int numRecords) {} + + /** + * Returns the maximum number of values that can be stored in this vector instance. + * + * @return the maximum number of values that can be stored in this vector instance. + */ + @Override + public int getValueCapacity() { + return getChildrenFromFields().stream() + .mapToInt(item -> item != null ? item.getValueCapacity() : 0) + .min() + .orElseThrow(NoSuchElementException::new); + } + + /** Alternative to clear(). Allows use as an AutoCloseable in try-with-resources. */ + @Override + public void close() { + for (FieldVector v : getChildrenFromFields()) { + v.close(); + } + } + + /** + * Release any owned ArrowBuf and reset the ValueVector to the initial state. If the vector has + * any child vectors, they will also be cleared. + */ + @Override + public void clear() { + for (FieldVector v : getChildrenFromFields()) { + v.clear(); + } + } + + /** + * Reset the ValueVector to the initial state without releasing any owned ArrowBuf. Buffer + * capacities will remain unchanged and any previous data will be zeroed out. This includes + * buffers for data, validity, offset, etc. If the vector has any child vectors, they will also be + * reset. + */ + @Override + public void reset() { + for (FieldVector v : getChildrenFromFields()) { + v.reset(); + } + valueCount = 0; + } + + /** + * Get information about how this field is materialized. + * + * @return the field corresponding to this vector + */ + @Override + public Field getField() { + return field; + } + + @Override + public MinorType getMinorType() { + return MinorType.RUNENDENCODED; + } + + /** + * To transfer quota responsibility. + * + * @param allocator the target allocator + * @return a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new + * target vector of the same type. + */ + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + throw new UnsupportedOperationException( + "RunEndEncodedVector does not support getTransferPair(BufferAllocator)"); + } + + /** + * To transfer quota responsibility. + * + * @param ref the name of the vector + * @param allocator the target allocator + * @return a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new + * target vector of the same type. + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return getTransferPair(ref, allocator, null); + } + + /** + * To transfer quota responsibility. + * + * @param field the Field object used by the target vector + * @param allocator the target allocator + * @return a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new + * target vector of the same type. + */ + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator) { + return getTransferPair(field, allocator, null); + } + + /** + * To transfer quota responsibility. + * + * @param ref the name of the vector + * @param allocator the target allocator + * @param callBack A schema change callback. + * @return a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new + * target vector of the same type. + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + throw new UnsupportedOperationException( + "RunEndEncodedVector does not support getTransferPair(String, BufferAllocator, CallBack)"); + } + + /** + * To transfer quota responsibility. + * + * @param field the Field object used by the target vector + * @param allocator the target allocator + * @param callBack A schema change callback. + * @return a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new + * target vector of the same type. + */ + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { + throw new UnsupportedOperationException( + "RunEndEncodedVector does not support getTransferPair(Field, BufferAllocator, CallBack)"); + } + + /** + * Makes a new transfer pair used to transfer underlying buffers. + * + * @param target the target for the transfer + * @return a new {@link org.apache.arrow.vector.util.TransferPair transfer pair} that is used to + * transfer underlying buffers into the target vector. + */ + @Override + public TransferPair makeTransferPair(ValueVector target) { + throw new UnsupportedOperationException( + "RunEndEncodedVector does not support makeTransferPair(ValueVector)"); + } + + /** + * Get a reader for this vector. + * + * @return a {@link org.apache.arrow.vector.complex.reader.FieldReader field reader} that supports + * reading values from this vector. + */ + @Override + public FieldReader getReader() { + throw new UnsupportedOperationException("Not yet implemented."); + } + + /** + * Get a writer for this vector. + * + * @return a {@link org.apache.arrow.vector.complex.writer.FieldWriter field writer} that supports + * writing values to this vector. + */ + public FieldWriter getWriter() { + throw new UnsupportedOperationException("Not yet implemented."); + } + + /** + * Get the number of bytes used by this vector. + * + * @return the number of bytes that is used by this vector instance. + */ + @Override + public int getBufferSize() { + int bufferSize = 0; + for (FieldVector v : getChildrenFromFields()) { + bufferSize += v.getBufferSize(); + } + return bufferSize; + } + + /** + * Returns the number of bytes that is used by this vector if it holds the given number of values. + * The result will be the same as if setValueCount() were called, followed by calling + * getBufferSize(), but without any of the closing side-effects that setValueCount() implies wrt + * finishing off the population of a vector. Some operations might wish to use this to determine + * how much memory has been used by a vector so far, even though it is not finished being + * populated. + * + * @param valueCount the number of values to assume this vector contains + * @return the buffer size if this vector is holding valueCount values + */ + @Override + public int getBufferSizeFor(int valueCount) { + return 0; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't impact the + * reference counts for this buffer so it only should be used for in-context access. Also note + * that this buffer changes regularly thus external classes shouldn't hold a reference to it + * (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted; + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that is used by this vector instance. + */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + return new ArrowBuf[0]; + } + + /** + * Gets the underlying buffer associated with validity vector. + * + * @return buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + throw new UnsupportedOperationException( + "Run-end encoded vectors do not have a validity buffer."); + } + + /** + * Gets the underlying buffer associated with data vector. + * + * @return buffer + */ + @Override + public ArrowBuf getDataBuffer() { + throw new UnsupportedOperationException("Run-end encoded vectors do not have a data buffer."); + } + + /** + * Gets the underlying buffer associated with offset vector. + * + * @return buffer + */ + @Override + public ArrowBuf getOffsetBuffer() { + throw new UnsupportedOperationException("Run-end encoded vectors do not have a offset buffer."); + } + + /** + * Gets the number of values. + * + * @return number of values in the vector + */ + @Override + public int getValueCount() { + return valueCount; + } + + /** Set number of values in the vector. */ + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + } + + /** + * Get friendly type object from the vector. + * + * @param index index of object to get + * @return friendly type object + */ + @Override + public Object getObject(int index) { + checkIndex(index); + int physicalIndex = getPhysicalIndex(index); + return valuesVector.getObject(physicalIndex); + } + + /** + * Get the run end of giving index. + * + * @param index index of the run end to get + * @return the run end of giving index + */ + public int getRunEnd(int index) { + checkIndex(index); + int physicalIndex = getPhysicalIndex(index); + return (int) ((BaseIntVector) runEndsVector).getValueAsLong(physicalIndex); + } + + /** + * Returns number of null elements in the vector. + * + * @return number of null elements + */ + @Override + public int getNullCount() { + // Null count is always 0 for run-end encoded array + return 0; + } + + /** + * Check whether an element in the vector is null. + * + * @param index index to check for null + * @return true if element is null + */ + @Override + public boolean isNull(int index) { + int physicalIndex = getPhysicalIndex(runEndsVector, index); + return valuesVector.isNull(physicalIndex); + } + + /** Returns hashCode of element in index with the default hasher. */ + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + /** Returns hashCode of element in index with the given hasher. */ + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + int hash = 0; + for (FieldVector v : getChildrenFromFields()) { + if (index < v.getValueCount()) { + hash = ByteFunctionHelpers.combineHash(hash, v.hashCode(index, hasher)); + } + } + return hash; + } + + /** + * Accept a generic {@link VectorVisitor} and return the result. + * + * @param the output result type. + * @param the input data together with visitor. + */ + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } + + /** + * Gets the name of the vector. + * + * @return the name of the vector. + */ + @Override + public String getName() { + return this.field.getName(); + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableCollection(getChildrenFromFields()).iterator(); + } + + /** FieldVector interface */ + + /** + * Initializes the child vectors to be later loaded with loadBuffers. + * + * @param children the schema containing the run_ends column first and the values column second + */ + @Override + public void initializeChildrenFromFields(List children) { + checkArgument( + children.size() == 2, + "Run-end encoded vectors must have two child Fields. Found: %s", + children.isEmpty() ? "none" : children); + checkArgument( + Arrays.asList( + MinorType.SMALLINT.getType(), MinorType.INT.getType(), MinorType.BIGINT.getType()) + .contains(children.get(0).getType()), + "The first field represents the run-end vector and must be of type int " + + "with size 16, 32, or 64 bits. Found: %s", + children.get(0).getType()); + runEndsVector = (BaseIntVector) children.get(0).createVector(allocator); + valuesVector = children.get(1).createVector(allocator); + field = new Field(field.getName(), field.getFieldType(), children); + } + + /** + * The returned list is the same size as the list passed to initializeChildrenFromFields. + * + * @return the children according to schema (empty for primitive types) + */ + @Override + public List getChildrenFromFields() { + return Arrays.asList(runEndsVector, valuesVector); + } + + /** + * Loads data in the vectors. (ownBuffers must be the same size as getFieldVectors()) + * + * @param fieldNode the fieldNode + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (!ownBuffers.isEmpty()) { + throw new UnsupportedOperationException( + "Run-end encoded vectors do not have any associated buffers."); + } + } + + /** + * Get the buffers of the fields, (same size as getFieldVectors() since it is their content). + * + * @return the buffers containing the data for this vector (ready for reading) + */ + @Override + public List getFieldBuffers() { + return List.of(); + } + + /** + * Get the inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + * @deprecated This API will be removed as the current implementations no longer support inner + * vectors. + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers()."); + } + + /** + * Gets the starting address of the underlying buffer associated with validity vector. + * + * @return buffer address + */ + @Override + public long getValidityBufferAddress() { + throw new UnsupportedOperationException( + "Run-end encoded vectors do not have a validity buffer."); + } + + /** + * Gets the starting address of the underlying buffer associated with data vector. + * + * @return buffer address + */ + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException("Run-end encoded vectors do not have a data buffer."); + } + + /** + * Gets the starting address of the underlying buffer associated with offset vector. + * + * @return buffer address + */ + @Override + public long getOffsetBufferAddress() { + throw new UnsupportedOperationException( + "Run-end encoded vectors do not have an offset buffer."); + } + + /** + * Set the element at the given index to null. + * + * @param index the value to change + */ + @Override + public void setNull(int index) { + throw new UnsupportedOperationException( + "Run-end encoded vectors do not have a validity buffer."); + } + + public FieldVector getRunEndsVector() { + return runEndsVector; + } + + public FieldVector getValuesVector() { + return valuesVector; + } + + private void checkIndex(int logicalIndex) { + if (logicalIndex < 0 || logicalIndex >= valueCount) { + throw new IndexOutOfBoundsException( + String.format("index: %s, expected range (0, %s)", logicalIndex, valueCount)); + } + } + + /** + * The physical index is the index of the first value that is larger than logical index. e.g. if + * run_ends is [1,2,3], the physical index of logical index from 0 to 5 is [0, 1, 1, 2, 2, 2] + */ + public int getPhysicalIndex(int logicalIndex) { + return getPhysicalIndex(runEndsVector, logicalIndex); + } + + static int getPhysicalIndex(FieldVector runEndVector, int logicalIndex) { + if (runEndVector == null || runEndVector.getValueCount() == 0) { + return -1; + } + + int low = 0; + int high = runEndVector.getValueCount() - 1; + int result = -1; + + while (low <= high) { + int mid = low + (high - low) / 2; + long valueAsLong = ((BaseIntVector) runEndVector).getValueAsLong(mid); + if (valueAsLong > logicalIndex) { + result = mid; + high = mid - 1; + } else { + low = mid + 1; + } + } + + return result; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueType.java b/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueType.java index f4f06dad2a424..ca56214fdac77 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueType.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/extension/OpaqueType.java @@ -394,5 +394,10 @@ public FieldVector visit(ListView type) { public FieldVector visit(LargeListView type) { throw unsupported(type); } + + @Override + public FieldVector visit(RunEndEncoded type) { + throw unsupported(type); + } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 6b2c56de01c40..e9b963b62c13b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -73,6 +73,7 @@ import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.BigIntWriterImpl; @@ -142,6 +143,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.types.pojo.ArrowType.Map; import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.ArrowType.RunEndEncoded; import org.apache.arrow.vector.types.pojo.ArrowType.Struct; import org.apache.arrow.vector.types.pojo.ArrowType.Time; import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; @@ -786,6 +788,19 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { .getNewFieldWriter(vector); } }, + RUNENDENCODED(RunEndEncoded.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, BufferAllocator allocator, CallBack schemaChangeCallback) { + return new RunEndEncodedVector(field, allocator, schemaChangeCallback); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + throw new UnsupportedOperationException( + "FieldWriter for run-end encoded vector is not implemented yet."); + } + }, ; private final ArrowType type; @@ -1021,6 +1036,11 @@ public MinorType visit(LargeListView type) { public MinorType visit(ExtensionType type) { return MinorType.EXTENSIONTYPE; } + + @Override + public MinorType visit(RunEndEncoded type) { + return MinorType.RUNENDENCODED; + } }); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java index 0c9140c360d15..ef31b4f837344 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java @@ -20,6 +20,7 @@ import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseIntVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthViewVector; @@ -35,6 +36,7 @@ import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -287,4 +289,36 @@ public Void visit(ExtensionTypeVector vector, Void value) { vector.getUnderlyingVector().accept(this, value); return null; } + + @Override + public Void visit(RunEndEncodedVector vector, Void value) { + validateVectorCommon(vector); + int valueCount = vector.getValueCount(); + FieldVector runEndsVector = vector.getRunEndsVector(); + + if (runEndsVector != null) { + validateOrThrow( + runEndsVector.getNullCount() == 0, "Run ends vector cannot contain null values"); + runEndsVector.accept(this, null); + + int runCount = runEndsVector.getValueCount(); + if (runCount == 0) { + validateOrThrow(valueCount == 0, "Run end vector does not contain enough elements"); + } else if (runCount > 0) { + double lastEnd = ((BaseIntVector) runEndsVector).getValueAsLong(runCount - 1); + validateOrThrow( + valueCount == lastEnd, + "Vector logic length not equal to the last end in run ends vector. Logical length %s, last end %s", + valueCount, + lastEnd); + } + } + + FieldVector valuesVector = vector.getValuesVector(); + if (valuesVector != null) { + valuesVector.accept(this, null); + } + + return null; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java index f947dcf41342f..daad41dbdc2ce 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java @@ -67,6 +67,7 @@ import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; @@ -478,4 +479,15 @@ public Void visit(ExtensionTypeVector vector, Void value) { validateExtensionTypeVector(vector); return null; } + + @Override + public Void visit(RunEndEncodedVector vector, Void value) { + validateVectorCommon(vector, ArrowType.RunEndEncoded.class); + for (ValueVector subVector : vector.getChildrenFromFields()) { + if (subVector != null) { + subVector.accept(this, null); + } + } + return null; + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java new file mode 100644 index 0000000000000..3f4be2e52ce56 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; +import java.util.function.Function; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; +import org.apache.arrow.vector.complex.RunEndEncodedVector; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType.RunEndEncoded; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestRunEndEncodedVector { + + private BufferAllocator allocator; + + @BeforeEach + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @AfterEach + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testInitializeChildrenFromFields() { + final FieldType valueType = FieldType.notNullable(Types.MinorType.BIGINT.getType()); + final FieldType runEndType = FieldType.notNullable(Types.MinorType.INT.getType()); + final Field valueField = new Field("value", valueType, null); + final Field runEndField = new Field("ree", runEndType, null); + + try (RunEndEncodedVector reeVector = RunEndEncodedVector.empty("empty", allocator)) { + reeVector.initializeChildrenFromFields(List.of(runEndField, valueField)); + reeVector.validate(); + } + } + + /** Create REE vector with constant value. */ + @Test + public void testConstantValueVector() { + final Field runEndEncodedField = createBigIntRunEndEncodedField("constant"); + int logicalValueCount = 100; + + // constant vector + try (RunEndEncodedVector reeVector = + new RunEndEncodedVector(runEndEncodedField, allocator, null)) { + Long value = 65536L; + setConstantVector(reeVector, value, logicalValueCount); + assertEquals(logicalValueCount, reeVector.getValueCount()); + for (int i = 0; i < logicalValueCount; i++) { + assertEquals(value, reeVector.getObject(i)); + } + } + + // constant null vector + try (RunEndEncodedVector reeVector = + new RunEndEncodedVector(runEndEncodedField, allocator, null)) { + setConstantVector(reeVector, null, logicalValueCount); + assertEquals(logicalValueCount, reeVector.getValueCount()); + // Null count is always 0 for run-end encoded array + assertEquals(0, reeVector.getNullCount()); + for (int i = 0; i < logicalValueCount; i++) { + assertTrue(reeVector.isNull(i)); + assertNull(reeVector.getObject(i)); + } + } + } + + @Test + public void testBasicRunEndEncodedVector() { + try (RunEndEncodedVector reeVector = + new RunEndEncodedVector(createBigIntRunEndEncodedField("basic"), allocator, null)) { + + // Create REE vector representing: + // [null, 2, 2, null, null, null, 4, 4, 4, 4, null, null, null, null, null]. + int runCount = 5; + final int logicalValueCount = + setBasicVector(reeVector, runCount, i -> i % 2 == 0 ? null : i + 1, i -> i + 1); + + assertEquals(15, reeVector.getValueCount()); + int index = 0; + for (int run = 0; run < runCount; run++) { + long expectedRunValue = (long) run + 1; + for (int j = 0; j <= run; j++) { + if (run % 2 == 0) { + assertNull(reeVector.getObject(index)); + } else { + assertEquals(expectedRunValue, reeVector.getObject(index)); + } + index++; + } + } + + // test index out of bound + assertThrows(IndexOutOfBoundsException.class, () -> reeVector.getObject(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> reeVector.getObject(logicalValueCount)); + } + } + + @Test + public void testRangeCompare() { + // test compare same constant vector + RunEndEncodedVector constantVector = + new RunEndEncodedVector(createBigIntRunEndEncodedField("constant"), allocator, null); + int logicalValueCount = 15; + + setConstantVector(constantVector, 1L, logicalValueCount); + + assertTrue( + constantVector.accept( + new RangeEqualsVisitor(constantVector, constantVector), + new Range(0, 0, logicalValueCount))); + assertTrue( + constantVector.accept( + new RangeEqualsVisitor(constantVector, constantVector), new Range(1, 1, 14))); + assertTrue( + constantVector.accept( + new RangeEqualsVisitor(constantVector, constantVector), new Range(1, 2, 13))); + assertFalse( + constantVector.accept( + new RangeEqualsVisitor(constantVector, constantVector), new Range(1, 10, 10))); + assertFalse( + constantVector.accept( + new RangeEqualsVisitor(constantVector, constantVector), new Range(10, 1, 10))); + + // Create REE vector representing: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]. + RunEndEncodedVector reeVector = + new RunEndEncodedVector(createBigIntRunEndEncodedField("basic"), allocator, null); + setBasicVector(reeVector, 5, i -> i + 1, i -> i + 1); + + assertTrue( + reeVector.accept( + new RangeEqualsVisitor(reeVector, reeVector), new Range(0, 0, logicalValueCount))); + assertTrue( + reeVector.accept( + new RangeEqualsVisitor(reeVector, reeVector), new Range(2, 2, logicalValueCount - 2))); + assertFalse( + reeVector.accept( + new RangeEqualsVisitor(reeVector, reeVector), new Range(1, 2, logicalValueCount - 2))); + + assertFalse( + reeVector.accept( + new RangeEqualsVisitor(reeVector, constantVector), new Range(0, 0, logicalValueCount))); + + // Create REE vector representing: [2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]. + RunEndEncodedVector reeVector2 = + new RunEndEncodedVector(createBigIntRunEndEncodedField("basic"), allocator, null); + setBasicVector(reeVector2, 4, i -> i + 2, i -> i + 2); + + assertTrue( + reeVector.accept( + new RangeEqualsVisitor(reeVector, reeVector2), new Range(1, 0, logicalValueCount - 1))); + + constantVector.close(); + reeVector.close(); + reeVector2.close(); + } + + private static Field createBigIntRunEndEncodedField(String fieldName) { + final FieldType valueType = FieldType.notNullable(Types.MinorType.BIGINT.getType()); + final FieldType runEndType = FieldType.notNullable(Types.MinorType.INT.getType()); + + final Field valueField = new Field("value", valueType, null); + final Field runEndField = new Field("ree", runEndType, null); + + return new Field( + fieldName, FieldType.notNullable(RunEndEncoded.INSTANCE), List.of(runEndField, valueField)); + } + + private static void setConstantVector( + RunEndEncodedVector constantVector, Long value, long logicalValueCount) { + setBasicVector(constantVector, 1, i -> value, i -> logicalValueCount); + } + + private static int setBasicVector( + RunEndEncodedVector reeVector, + int runCount, + Function runValueSupplier, + Function runLengthSupplier) { + reeVector.allocateNew(); + reeVector.setInitialCapacity(runCount); + int end = 0; + for (int i = 0; i < runCount; i++) { + Long runValue = runValueSupplier.apply((long) i); + if (runValue == null) { + reeVector.getValuesVector().setNull(i); + } else { + ((BigIntVector) reeVector.getValuesVector()).set(i, runValue); + } + + Long runLength = runLengthSupplier.apply((long) i); + assert runLength != null && runLength > 0; + end += runLength; + ((IntVector) reeVector.getRunEndsVector()).set(i, end); + } + + final int logicalValueCount = end; + reeVector.getValuesVector().setValueCount(runCount); + reeVector.getRunEndsVector().setValueCount(runCount); + reeVector.setValueCount(logicalValueCount); + return logicalValueCount; + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java b/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java index 60c4c3a9bc6d2..35c15bdf538f3 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVector.java @@ -23,6 +23,7 @@ import java.nio.charset.Charset; import java.util.Arrays; +import java.util.List; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; @@ -33,6 +34,7 @@ import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.NullableStructWriter; @@ -40,6 +42,7 @@ import org.apache.arrow.vector.holders.NullableFloat8Holder; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.RunEndEncoded; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.junit.jupiter.api.AfterEach; @@ -265,6 +268,41 @@ public void testBaseFixedWidthVectorInstanceMethod() { } } + @Test + public void testRunEndEncodedVector() { + final FieldType valueType = FieldType.notNullable(Types.MinorType.BIGINT.getType()); + final FieldType runEndType = FieldType.notNullable(Types.MinorType.INT.getType()); + + final Field valueField = new Field("value", valueType, null); + final Field runEndField = new Field("ree", runEndType, null); + + try (RunEndEncodedVector vector = + new RunEndEncodedVector( + new Field( + "ree", + FieldType.notNullable(RunEndEncoded.INSTANCE), + List.of(runEndField, valueField)), + allocator, + null)) { + vector.validate(); + + int runCount = 1; + vector.allocateNew(); + ((BigIntVector) vector.getValuesVector()).set(0, 1); + ((IntVector) vector.getRunEndsVector()).set(0, 10); + vector.getValuesVector().setValueCount(runCount); + vector.getRunEndsVector().setValueCount(runCount); + vector.setValueCount(10); + + vector.validate(); + + vector.getRunEndsVector().setValueCount(0); + ValidateUtil.ValidateException e = + assertThrows(ValidateUtil.ValidateException.class, () -> vector.validate()); + assertTrue(e.getMessage().contains("Run end vector does not contain enough elements")); + } + } + private void writeStructVector(NullableStructWriter writer, int value1, long value2) { writer.start(); writer.integer("f0").writeInt(value1);