-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
…#43269)

### Rationale for this change

See #43266. Note that LargeBinary and LargeString are still limited to 2 GiB buffers, and LargeList is limited to offsets that can be represented as int32.

### What changes are included in this PR?

* Add new Array subtypes: LargeBinaryArray, LargeStringArray and LargeListArray
* Support round-tripping these array types via the IPC format
* Support round-tripping these array types via the C Data Interface
* Improve error messages when importing arrays that are too large via IPC or C Data Interface
* Enable integration tests for the new types
* Update documentation

### Are these changes tested?

Yes, I've added some basic tests specifically for the new array types, and added these to the test data generator so they're covered by the existing tests for round tripping using IPC and C Data Interface.

### Are there any user-facing changes?

Yes, this is a new user facing feature.

### Implementation notes

* I haven't added builders for these new array types. Given they're added to help with interoperability with other libraries, I wouldn't expect .NET users to build arrays of these types as they provide no other benefit over the non-large types until we have proper large memory support. But I'm happy to add this if it would be useful.
* The new array types share a lot of logic with the non-large types. I considered trying to consolidate this logic by adding a new `BinaryArrayBase<TOffset>` class for example, but I think this would require generic math support to work nicely, and would still complicate the code quite a bit and add extra virtual method call overhead. So I think it's fine to keep these new Array subtypes independent from the non-large types.
* I haven't included support for materializing a LargeStringArray (see #41048). I'm not sure whether there would be a use for this, but it could be added later if needed.

* GitHub Issue: #43266

Authored-by: Adam Reeve <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
- Loading branch information
Showing
26 changed files
with
1,254 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one or more | ||
// contributor license agreements. See the NOTICE file distributed with | ||
// this work for additional information regarding copyright ownership. | ||
// The ASF licenses this file to You under the Apache License, Version 2.0 | ||
// (the "License"); you may not use this file except in compliance with | ||
// the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
using Apache.Arrow.Types; | ||
using System; | ||
using System.Collections; | ||
using System.Collections.Generic; | ||
using System.Runtime.CompilerServices; | ||
|
||
namespace Apache.Arrow; | ||
|
||
/// <summary>
/// An Arrow array of variable-length binary values whose value offsets are stored as 64-bit integers.
/// </summary>
/// <remarks>
/// Offsets and value lengths are narrowed to <see cref="int"/> with overflow checking when a value is
/// accessed, so reading a value whose offset or length does not fit in an <see cref="int"/> throws
/// <see cref="OverflowException"/>. When the array is consumed through its list and collection interfaces,
/// null entries are materialized as empty byte arrays.
/// </remarks>
public class LargeBinaryArray : Array, IReadOnlyList<byte[]>, ICollection<byte[]>
{
    /// <summary>
    /// Creates an array over <paramref name="data"/>, checking it against
    /// <see cref="ArrowTypeId.LargeBinary"/> and a buffer count of three.
    /// </summary>
    /// <param name="data">The array data to wrap.</param>
    public LargeBinaryArray(ArrayData data)
        : base(data)
    {
        data.EnsureDataType(ArrowTypeId.LargeBinary);
        data.EnsureBufferCount(3);
    }

    /// <summary>
    /// Creates an array over <paramref name="data"/>, checking it against the supplied
    /// <paramref name="typeId"/> instead of <see cref="ArrowTypeId.LargeBinary"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): presumably intended for derived array types that share this buffer layout
    /// (for example large strings) - confirm against callers.
    /// </remarks>
    /// <param name="typeId">The type id the data is expected to have.</param>
    /// <param name="data">The array data to wrap.</param>
    public LargeBinaryArray(ArrowTypeId typeId, ArrayData data)
        : base(data)
    {
        data.EnsureDataType(typeId);
        data.EnsureBufferCount(3);
    }

    /// <summary>
    /// Creates an array from its individual buffers. The buffers are handed to the underlying
    /// <see cref="ArrayData"/> in the order: null bitmap, value offsets, data.
    /// </summary>
    /// <param name="dataType">The Arrow type of the array.</param>
    /// <param name="length">The number of values in the array.</param>
    /// <param name="valueOffsetsBuffer">Buffer holding the 64-bit value offsets.</param>
    /// <param name="dataBuffer">Buffer holding the concatenated value bytes.</param>
    /// <param name="nullBitmapBuffer">Buffer holding the validity bitmap.</param>
    /// <param name="nullCount">The number of null values.</param>
    /// <param name="offset">The logical offset of the array into its buffers.</param>
    public LargeBinaryArray(IArrowType dataType, int length,
        ArrowBuffer valueOffsetsBuffer,
        ArrowBuffer dataBuffer,
        ArrowBuffer nullBitmapBuffer,
        int nullCount = 0, int offset = 0)
        : this(new ArrayData(dataType, length, nullCount, offset,
            new[] { nullBitmapBuffer, valueOffsetsBuffer, dataBuffer }))
    { }

    /// <summary>Dispatches this array to <paramref name="visitor"/>.</summary>
    /// <param name="visitor">The visitor to accept.</param>
    public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor);

    /// <summary>The buffer holding the value offsets (buffer index 1).</summary>
    public ArrowBuffer ValueOffsetsBuffer => Data.Buffers[1];

    /// <summary>The buffer holding the value bytes (buffer index 2).</summary>
    public ArrowBuffer ValueBuffer => Data.Buffers[2];

    /// <summary>
    /// The value offsets as 64-bit integers, sliced to the <c>Length + 1</c> entries starting at
    /// <c>Offset</c>.
    /// </summary>
    public ReadOnlySpan<long> ValueOffsets => ValueOffsetsBuffer.Span.CastTo<long>().Slice(Offset, Length + 1);

    /// <summary>
    /// Get the length in bytes of the value at a given index in the array.
    /// </summary>
    /// <param name="index">Index of the value.</param>
    /// <returns>The byte length of the value, or zero if the value is not valid (null).</returns>
    /// <exception cref="ArgumentOutOfRangeException">If the index is negative or beyond the length of the array.
    /// </exception>
    /// <exception cref="OverflowException">If the length does not fit in an <see cref="int"/>.</exception>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public int GetValueLength(int index)
    {
        if (index < 0 || index >= Length)
        {
            throw new ArgumentOutOfRangeException(nameof(index));
        }
        if (!IsValid(index))
        {
            return 0;
        }

        ReadOnlySpan<long> offsets = ValueOffsets;
        // Offsets are 64-bit; the checked cast surfaces an oversized value as an exception
        // rather than silently truncating it.
        return checked((int)(offsets[index + 1] - offsets[index]));
    }

    /// <summary>
    /// Get the collection of bytes, as a read-only span, at a given index in the array.
    /// </summary>
    /// <remarks>
    /// Note that this method cannot reliably identify null values, which are indistinguishable from empty byte
    /// collection values when seen in the context of this method's return type of <see cref="ReadOnlySpan{Byte}"/>.
    /// Use the <see cref="Array.IsNull"/> method or the <see cref="GetBytes(int, out bool)"/> overload instead
    /// to reliably determine null values.
    /// </remarks>
    /// <param name="index">Index at which to get bytes.</param>
    /// <returns>Returns a <see cref="ReadOnlySpan{Byte}"/> object.</returns>
    /// <exception cref="ArgumentOutOfRangeException">If the index is negative or beyond the length of the array.
    /// </exception>
    public ReadOnlySpan<byte> GetBytes(int index) => GetBytes(index, out _);

    /// <summary>
    /// Get the collection of bytes, as a read-only span, at a given index in the array.
    /// </summary>
    /// <param name="index">Index at which to get bytes.</param>
    /// <param name="isNull">Set to <see langword="true"/> if the value at the given index is null.</param>
    /// <returns>Returns a <see cref="ReadOnlySpan{Byte}"/> object.</returns>
    /// <exception cref="ArgumentOutOfRangeException">If the index is negative or beyond the length of the array.
    /// </exception>
    /// <exception cref="OverflowException">If the value's offset or length does not fit in an
    /// <see cref="int"/>.</exception>
    public ReadOnlySpan<byte> GetBytes(int index, out bool isNull)
    {
        if (index < 0 || index >= Length)
        {
            throw new ArgumentOutOfRangeException(nameof(index));
        }

        isNull = IsNull(index);

        if (isNull)
        {
            // Note that `return null;` is valid syntax, but would be misleading as `null` in the context of a span
            // is actually returned as an empty span.
            return ReadOnlySpan<byte>.Empty;
        }

        var offset = checked((int)ValueOffsets[index]);
        return ValueBuffer.Span.Slice(offset, GetValueLength(index));
    }

    int IReadOnlyCollection<byte[]>.Count => Length;

    // Each access copies the value into a new array; a null entry yields an empty array.
    byte[] IReadOnlyList<byte[]>.this[int index] => GetBytes(index).ToArray();

    IEnumerator<byte[]> IEnumerable<byte[]>.GetEnumerator()
    {
        for (int index = 0; index < Length; index++)
        {
            yield return GetBytes(index).ToArray();
        }
    }

    IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable<byte[]>)this).GetEnumerator();

    int ICollection<byte[]>.Count => Length;
    bool ICollection<byte[]>.IsReadOnly => true;
    void ICollection<byte[]>.Add(byte[] item) => throw new NotSupportedException("Collection is read-only.");
    bool ICollection<byte[]>.Remove(byte[] item) => throw new NotSupportedException("Collection is read-only.");
    void ICollection<byte[]>.Clear() => throw new NotSupportedException("Collection is read-only.");

    // Compares by byte content. Null entries are seen as empty spans here, and a null item
    // converts to an empty span, so null and empty values are not distinguished.
    bool ICollection<byte[]>.Contains(byte[] item)
    {
        for (int index = 0; index < Length; index++)
        {
            if (GetBytes(index).SequenceEqual(item))
            {
                return true;
            }
        }

        return false;
    }

    // Copies every value into the destination as a freshly allocated array. Arguments are
    // validated before anything is written so a failed call never leaves a partial copy behind.
    void ICollection<byte[]>.CopyTo(byte[][] array, int arrayIndex)
    {
        if (array == null)
        {
            throw new ArgumentNullException(nameof(array));
        }
        if (arrayIndex < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(arrayIndex));
        }
        // arrayIndex is non-negative here, so the subtraction cannot overflow.
        if (array.Length - arrayIndex < Length)
        {
            throw new ArgumentException(
                "Destination array is not long enough to copy all the items in the collection.",
                nameof(array));
        }

        for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++)
        {
            array[destIndex] = GetBytes(srcIndex).ToArray();
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one or more | ||
// contributor license agreements. See the NOTICE file distributed with | ||
// this work for additional information regarding copyright ownership. | ||
// The ASF licenses this file to You under the Apache License, Version 2.0 | ||
// (the "License"); you may not use this file except in compliance with | ||
// the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
using System; | ||
using Apache.Arrow.Types; | ||
|
||
namespace Apache.Arrow
{
    /// <summary>
    /// An Arrow list array whose value offsets are stored as 64-bit integers.
    /// </summary>
    /// <remarks>
    /// Offsets and list lengths are narrowed to <see cref="int"/> with overflow checking when a list is
    /// accessed, so an offset or length that does not fit in an <see cref="int"/> results in an
    /// <see cref="OverflowException"/>.
    /// </remarks>
    public class LargeListArray : Array
    {
        /// <summary>The child array holding the elements of every list.</summary>
        public IArrowArray Values { get; }

        /// <summary>The buffer holding the value offsets (buffer index 1).</summary>
        public ArrowBuffer ValueOffsetsBuffer
        {
            get { return Data.Buffers[1]; }
        }

        /// <summary>
        /// The value offsets as 64-bit integers, sliced to the <c>Length + 1</c> entries starting at
        /// <c>Offset</c>.
        /// </summary>
        public ReadOnlySpan<long> ValueOffsets
        {
            get
            {
                ReadOnlySpan<long> allOffsets = ValueOffsetsBuffer.Span.CastTo<long>();
                return allOffsets.Slice(Offset, Length + 1);
            }
        }

        /// <summary>
        /// Creates a list array from its buffers and child values. The buffers are handed to the
        /// underlying <see cref="ArrayData"/> in the order: null bitmap, value offsets.
        /// </summary>
        /// <param name="dataType">The Arrow type of the array.</param>
        /// <param name="length">The number of lists in the array.</param>
        /// <param name="valueOffsetsBuffer">Buffer holding the 64-bit value offsets.</param>
        /// <param name="values">The child array holding the list elements.</param>
        /// <param name="nullBitmapBuffer">Buffer holding the validity bitmap.</param>
        /// <param name="nullCount">The number of null lists.</param>
        /// <param name="offset">The logical offset of the array into its buffers.</param>
        public LargeListArray(
            IArrowType dataType,
            int length,
            ArrowBuffer valueOffsetsBuffer,
            IArrowArray values,
            ArrowBuffer nullBitmapBuffer,
            int nullCount = 0,
            int offset = 0)
            : this(
                new ArrayData(
                    dataType,
                    length,
                    nullCount,
                    offset,
                    new[] { nullBitmapBuffer, valueOffsetsBuffer },
                    new[] { values.Data }),
                values)
        {
        }

        /// <summary>
        /// Creates a list array over <paramref name="data"/>, building the values array from the
        /// first child of the data.
        /// </summary>
        /// <param name="data">The array data to wrap.</param>
        public LargeListArray(ArrayData data)
            : this(data, ArrowArrayFactory.BuildArray(data.Children[0]))
        {
        }

        // Shared constructor: checks the buffer count and data type, then stores the values array.
        private LargeListArray(ArrayData data, IArrowArray values)
            : base(data)
        {
            data.EnsureBufferCount(2);
            data.EnsureDataType(ArrowTypeId.LargeList);
            Values = values;
        }

        /// <summary>Dispatches this array to <paramref name="visitor"/>.</summary>
        /// <param name="visitor">The visitor to accept.</param>
        public override void Accept(IArrowArrayVisitor visitor)
        {
            Accept(this, visitor);
        }

        /// <summary>
        /// Get the number of elements in the list at a given index.
        /// </summary>
        /// <param name="index">Index of the list.</param>
        /// <returns>The element count of the list, or zero if the list is null.</returns>
        /// <exception cref="ArgumentOutOfRangeException">If the index is negative or beyond the length of
        /// the array.</exception>
        public int GetValueLength(int index)
        {
            ThrowIfIndexOutOfRange(index);

            if (!IsNull(index))
            {
                ReadOnlySpan<long> offsets = ValueOffsets;
                long end = offsets[index + 1];
                long start = offsets[index];
                // Both the subtraction and the narrowing cast are overflow-checked.
                return checked((int)(end - start));
            }

            return 0;
        }

        /// <summary>
        /// Get the slice of <see cref="Values"/> that makes up the list at a given index.
        /// </summary>
        /// <param name="index">Index of the list.</param>
        /// <returns>
        /// The sliced values, or <see langword="null"/> if the list is null or <see cref="Values"/> is
        /// not an <see cref="Array"/>.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">If the index is negative or beyond the length of
        /// the array.</exception>
        public IArrowArray GetSlicedValues(int index)
        {
            ThrowIfIndexOutOfRange(index);

            if (IsNull(index))
            {
                return null;
            }

            if (Values is Array valuesArray)
            {
                int start = checked((int)ValueOffsets[index]);
                int count = GetValueLength(index);
                return valuesArray.Slice(start, count);
            }

            return default;
        }

        /// <summary>
        /// Disposes the values array when called with <paramref name="disposing"/> set, then lets the
        /// base class dispose.
        /// </summary>
        /// <param name="disposing">Whether managed resources should be released.</param>
        protected override void Dispose(bool disposing)
        {
            if (disposing && Values != null)
            {
                Values.Dispose();
            }

            base.Dispose(disposing);
        }

        // Rejects indices outside [0, Length).
        private void ThrowIfIndexOutOfRange(int index)
        {
            if (index < 0 || index >= Length)
            {
                throw new ArgumentOutOfRangeException(nameof(index));
            }
        }
    }
}
Oops, something went wrong.