Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-36795: [C#] Implement support for dense and sparse unions #36797

Merged
merged 21 commits into from
Sep 25, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
dc253c7
Initial changes for Union support
CurtHagenlocher Jul 20, 2023
1bb1417
Union-related fixes
CurtHagenlocher Jul 20, 2023
56df33a
Merge branch 'apache:main' into Union
CurtHagenlocher Jul 20, 2023
6b3ff65
Fixed IPC to work correctly for unions
CurtHagenlocher Jul 20, 2023
01eb95b
Merge branch 'Union' of https://github.com/CurtHagenlocher/arrow into…
CurtHagenlocher Jul 20, 2023
c576eca
Implement Archery support for C# unions
CurtHagenlocher Jul 20, 2023
a113a80
Better backwards compatibility
CurtHagenlocher Jul 21, 2023
bfc26ee
Merge branch 'main' of https://github.com/apache/arrow into Union
CurtHagenlocher Jul 23, 2023
2bd6f3d
Merge branch 'main' of https://github.com/apache/arrow into Union
CurtHagenlocher Jul 24, 2023
f892aba
Merge from main
CurtHagenlocher Aug 21, 2023
b77d04c
Merge from main and resolve conflicts
CurtHagenlocher Aug 21, 2023
039ec9e
Fix deserialization of fixed-size list
CurtHagenlocher Aug 21, 2023
0327be2
Fix bug in ctor
CurtHagenlocher Aug 25, 2023
832f7cb
PR feedback
CurtHagenlocher Sep 6, 2023
6591b81
Merge branch 'main' of https://github.com/CurtHagenlocher/arrow into …
CurtHagenlocher Sep 6, 2023
0de1117
Merge from main
CurtHagenlocher Sep 10, 2023
a29e1c1
Merge branch 'main' of https://github.com/CurtHagenlocher/arrow into …
CurtHagenlocher Sep 19, 2023
5ebefcf
Increment metadata version to V5 and add handling for V4 unions.
CurtHagenlocher Sep 22, 2023
ddac4d3
Merge branch 'main' into Union
CurtHagenlocher Sep 22, 2023
143a469
Correctly skip buffer in pre-V5 metadata
CurtHagenlocher Sep 23, 2023
3ef09de
Merge branch 'Union' of https://github.com/CurtHagenlocher/arrow into…
CurtHagenlocher Sep 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 2 additions & 11 deletions csharp/src/Apache.Arrow/Arrays/Array.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,7 @@ internal static void Accept<T>(T array, IArrowArrayVisitor visitor)

public Array Slice(int offset, int length)
{
if (offset > Length)
{
throw new ArgumentException($"Offset {offset} cannot be greater than Length {Length} for Array.Slice");
}

length = Math.Min(Data.Length - offset, length);
offset += Data.Offset;

ArrayData newData = Data.Slice(offset, length);
return ArrowArrayFactory.BuildArray(newData) as Array;
return ArrowArrayFactory.Slice(this, offset, length) as Array;
}

public void Dispose()
Expand All @@ -88,4 +79,4 @@ protected virtual void Dispose(bool disposing)
}
}
}
}
}
62 changes: 61 additions & 1 deletion csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ private class ArrayDataConcatenationVisitor :
IArrowTypeVisitor<StringType>,
IArrowTypeVisitor<ListType>,
IArrowTypeVisitor<FixedSizeListType>,
IArrowTypeVisitor<StructType>
IArrowTypeVisitor<StructType>,
IArrowTypeVisitor<UnionType>
{
public ArrayData Result { get; private set; }
private readonly IReadOnlyList<ArrayData> _arrayDataList;
Expand Down Expand Up @@ -123,6 +124,33 @@ public void Visit(StructType type)
Result = new ArrayData(type, _arrayDataList[0].Length, _arrayDataList[0].NullCount, 0, _arrayDataList[0].Buffers, children);
}

/// <summary>
/// Concatenates the union arrays held in <c>_arrayDataList</c> and stores the combined
/// data in <see cref="Result"/>.
/// </summary>
/// <param name="type">The union type being concatenated; its mode selects the buffer layout.</param>
/// <exception cref="InvalidOperationException">
/// The union mode is neither <see cref="UnionMode.Sparse"/> nor <see cref="UnionMode.Dense"/>.
/// </exception>
public void Visit(UnionType type)
{
    // Sparse unions carry a single (type id) buffer; dense unions add a value-offset buffer.
    int bufferCount = type.Mode switch
    {
        UnionMode.Sparse => 1,
        UnionMode.Dense => 2,
        _ => throw new InvalidOperationException($"Unsupported union mode {type.Mode}."),
    };

    CheckData(type, bufferCount);

    // Concatenate each child column independently across all inputs.
    int fieldCount = type.Fields.Count;
    List<ArrayData> children = new List<ArrayData>(fieldCount);
    for (int i = 0; i < fieldCount; i++)
    {
        children.Add(Concatenate(SelectChildren(i), _allocator));
    }

    ArrowBuffer[] buffers = new ArrowBuffer[bufferCount];
    buffers[0] = ConcatenateUnionTypeBuffer();
    if (bufferCount > 1)
    {
        buffers[1] = ConcatenateUnionOffsetBuffer();
    }

    Result = new ArrayData(type, _totalLength, _totalNullCount, 0, buffers, children);
}

public void Visit(IArrowType type)
{
throw new NotImplementedException($"Concatenation for {type.Name} is not supported yet.");
Expand Down Expand Up @@ -231,6 +259,38 @@ private ArrowBuffer ConcatenateOffsetBuffer()
return builder.Build(_allocator);
}

/// <summary>
/// Builds the type-id buffer of the concatenated union by appending, in input order,
/// the type ids of every array in <c>_arrayDataList</c>.
/// </summary>
/// <returns>A buffer holding one type id per element of the concatenated array.</returns>
private ArrowBuffer ConcatenateUnionTypeBuffer()
{
    var builder = new ArrowBuffer.Builder<byte>(_totalLength);

    foreach (ArrayData arrayData in _arrayDataList)
    {
        // Copy only the array's logical window. Appending the whole buffer would also
        // copy bytes that lie before Offset or after Offset + Length.
        ReadOnlySpan<byte> typeIds = arrayData.Buffers[0].Span.Slice(arrayData.Offset, arrayData.Length);
        builder.Append(typeIds);
    }

    return builder.Build(_allocator);
}

/// <summary>
/// Builds the value-offset buffer of a concatenated dense union.
/// </summary>
/// <remarks>
/// A dense-union offset is a position inside the child selected by the element's type id,
/// and there is exactly one offset per element (no trailing "end" offset as in list arrays).
/// Each offset is therefore rebased by the combined length of that same child in all
/// preceding inputs, rather than by a single running total.
/// NOTE(review): the type id is used directly as the child index, matching how
/// <c>UnionArray.IsValid</c> indexes <c>Fields</c> — confirm if type ids can be remapped.
/// </remarks>
/// <returns>A buffer holding one rebased offset per element of the concatenated array.</returns>
private ArrowBuffer ConcatenateUnionOffsetBuffer()
{
    var builder = new ArrowBuffer.Builder<int>(_totalLength);

    int childCount = _arrayDataList.Count > 0 ? _arrayDataList[0].Children.Length : 0;
    int[] baseOffsets = new int[childCount];

    foreach (ArrayData arrayData in _arrayDataList)
    {
        // Restrict both buffers to the array's logical window.
        ReadOnlySpan<byte> typeIds = arrayData.Buffers[0].Span.Slice(arrayData.Offset, arrayData.Length);
        ReadOnlySpan<int> offsets = arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length);

        for (int i = 0; i < offsets.Length; i++)
        {
            builder.Append(checked(baseOffsets[typeIds[i]] + offsets[i]));
        }

        // Values of the next input land after this input's values in every child.
        for (int child = 0; child < childCount; child++)
        {
            baseOffsets[child] = checked(baseOffsets[child] + arrayData.Children[child].Length);
        }
    }

    return builder.Build(_allocator);
}

private List<ArrayData> SelectChildren(int index)
{
var children = new List<ArrayData>(_arrayDataList.Count);
Expand Down
12 changes: 11 additions & 1 deletion csharp/src/Apache.Arrow/Arrays/ArrayDataTypeComparer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ internal sealed class ArrayDataTypeComparer :
IArrowTypeVisitor<FixedSizeBinaryType>,
IArrowTypeVisitor<ListType>,
IArrowTypeVisitor<FixedSizeListType>,
IArrowTypeVisitor<StructType>
IArrowTypeVisitor<StructType>,
IArrowTypeVisitor<UnionType>
{
private readonly IArrowType _expectedType;
private bool _dataTypeMatch;
Expand Down Expand Up @@ -122,6 +123,15 @@ public void Visit(StructType actualType)
}
}

/// <summary>
/// Marks the data types as matching when the expected type is also a union with the
/// same mode and with nested fields that compare equal.
/// </summary>
/// <param name="actualType">The union type of the array data being checked.</param>
public void Visit(UnionType actualType)
{
    // Sparse and dense unions use different buffer layouts, so the mode must agree
    // in addition to the nested fields.
    if (_expectedType is UnionType expectedType
        && expectedType.Mode == actualType.Mode
        && CompareNested(expectedType, actualType))
    {
        _dataTypeMatch = true;
    }
}

private static bool CompareNested(NestedType expectedType, NestedType actualType)
{
if (expectedType.Fields.Count != actualType.Fields.Count)
Expand Down
16 changes: 15 additions & 1 deletion csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public static IArrowArray BuildArray(ArrayData data)
case ArrowTypeId.Struct:
return new StructArray(data);
case ArrowTypeId.Union:
return new UnionArray(data);
return UnionArray.Create(data);
case ArrowTypeId.Date64:
return new Date64Array(data);
case ArrowTypeId.Date32:
Expand Down Expand Up @@ -91,5 +91,19 @@ public static IArrowArray BuildArray(ArrayData data)
throw new NotSupportedException($"An ArrowArray cannot be built for type {data.DataType.TypeId}.");
}
}

/// <summary>
/// Creates a new array over a window of <paramref name="array"/>.
/// </summary>
/// <param name="array">The array to slice.</param>
/// <param name="offset">Start of the window, relative to the array.</param>
/// <param name="length">Requested element count; capped at the elements remaining after <paramref name="offset"/>.</param>
/// <returns>An array built from the sliced array data.</returns>
/// <exception cref="ArgumentException"><paramref name="offset"/> is greater than the array's length.</exception>
public static IArrowArray Slice(IArrowArray array, int offset, int length)
{
    if (offset > array.Length)
    {
        throw new ArgumentException($"Offset {offset} cannot be greater than Length {array.Length} for Array.Slice");
    }

    // Cap the requested length, then translate the offset into the underlying data's coordinates.
    int cappedLength = Math.Min(array.Data.Length - offset, length);
    int dataOffset = offset + array.Data.Offset;

    return BuildArray(array.Data.Slice(dataOffset, cappedLength));
}
}
}
52 changes: 52 additions & 0 deletions csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

using Apache.Arrow.Types;
using System;
using System.Collections.Generic;
using System.Linq;

namespace Apache.Arrow
{
public class DenseUnionArray : UnionArray
{
/// <summary>The second buffer of the array data, which holds this dense union's value offsets.</summary>
public ArrowBuffer ValueOffsetBuffer => Data.Buffers[1];

/// <summary>
/// <see cref="ValueOffsetBuffer"/> viewed as 32-bit integers. The span covers the whole
/// buffer; it is not trimmed to the array's length here.
/// </summary>
public ReadOnlySpan<int> ValueOffsets => ValueOffsetBuffer.Span.CastTo<int>();

/// <summary>
/// Creates a dense union array from its parts.
/// </summary>
/// <param name="dataType">Type of the array; its union mode is validated to be dense.</param>
/// <param name="length">Number of elements in the array.</param>
/// <param name="children">Child arrays; their data becomes the children of the array data.</param>
/// <param name="typeIds">Buffer stored as the first buffer (type ids).</param>
/// <param name="valuesOffsetBuffer">Buffer stored as the second buffer (value offsets).</param>
/// <param name="nullCount">Null count recorded on the array data.</param>
/// <param name="offset">Offset recorded on the array data.</param>
/// <exception cref="ArgumentException">The union mode of the type is not dense.</exception>
public DenseUnionArray(
IArrowType dataType,
int length,
IEnumerable<IArrowArray> children,
ArrowBuffer typeIds,
ArrowBuffer valuesOffsetBuffer,
int nullCount = 0,
int offset = 0)
: base(new ArrayData(
dataType, length, nullCount, offset, new[] { typeIds, valuesOffsetBuffer },
children.Select(child => child.Data)))
{
// NOTE(review): children is enumerated here a second time (the base call above already
// projects it); callers passing a lazy sequence will have it evaluated twice.
_fields = children.ToArray();
// Runs after the base constructor, so the array data has already been created at this point.
ValidateMode(UnionMode.Dense, Type.Mode);
}

public DenseUnionArray(ArrayData data)
: base(data)
{
ValidateMode(UnionMode.Dense, Type.Mode);
data.EnsureBufferCount(2); // TODO:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe link this TODO to a new issue? (and what is the TODO about, given SparseUnionArray lacks one?)

Copy link
Contributor Author

@CurtHagenlocher CurtHagenlocher Sep 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Heck if I can remember... . Will just remove the TODO.

}
}
}
3 changes: 3 additions & 0 deletions csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ public TBuilder Append(T value)
return Instance;
}

/// <summary>
/// Appends a nullable value: a null entry when <paramref name="value"/> has no value,
/// otherwise the wrapped value.
/// </summary>
/// <param name="value">The value to append, or null.</param>
/// <returns>The result of the underlying append call.</returns>
public TBuilder Append(T? value)
{
    if (value.HasValue)
    {
        return Append(value.Value);
    }

    return AppendNull();
}

public TBuilder Append(ReadOnlySpan<T> span)
{
int len = ValueBuffer.Length;
Expand Down
46 changes: 46 additions & 0 deletions csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

using Apache.Arrow.Types;
using System.Collections.Generic;
using System.Linq;

namespace Apache.Arrow
{
/// <summary>
/// A union array whose type is validated to use <see cref="UnionMode.Sparse"/> and whose
/// array data carries a single buffer (the type ids).
/// </summary>
public class SparseUnionArray : UnionArray
{
/// <summary>
/// Creates a sparse union array from its parts.
/// </summary>
/// <param name="dataType">Type of the array; its union mode is validated to be sparse.</param>
/// <param name="length">Number of elements in the array.</param>
/// <param name="children">Child arrays; their data becomes the children of the array data.</param>
/// <param name="typeIds">Buffer stored as the only buffer (type ids).</param>
/// <param name="nullCount">Null count recorded on the array data.</param>
/// <param name="offset">Offset recorded on the array data.</param>
public SparseUnionArray(
IArrowType dataType,
int length,
IEnumerable<IArrowArray> children,
ArrowBuffer typeIds,
int nullCount = 0,
int offset = 0)
: base(new ArrayData(
dataType, length, nullCount, offset, new[] { typeIds },
children.Select(child => child.Data)))
{
// NOTE(review): children is enumerated a second time here; a lazy sequence is evaluated twice.
_fields = children.ToArray();
ValidateMode(UnionMode.Sparse, Type.Mode);
}

/// <summary>
/// Wraps existing array data, checking that its union mode is sparse and then its buffer count.
/// </summary>
/// <param name="data">The array data to wrap.</param>
public SparseUnionArray(ArrayData data)
: base(data)
{
ValidateMode(UnionMode.Sparse, Type.Mode);
data.EnsureBufferCount(1);
}
}
}
77 changes: 64 additions & 13 deletions csharp/src/Apache.Arrow/Arrays/UnionArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,88 @@

using Apache.Arrow.Types;
using System;
using System.Collections.Generic;
using System.Threading;

namespace Apache.Arrow
{
public class UnionArray: Array
public abstract class UnionArray : IArrowArray
{
public UnionType Type => Data.DataType as UnionType;
protected IReadOnlyList<IArrowArray> _fields;

public UnionMode Mode => Type.Mode;
public IReadOnlyList<IArrowArray> Fields =>
LazyInitializer.EnsureInitialized(ref _fields, () => InitializeFields());

public ArrayData Data { get; }

public ArrowBuffer TypeBuffer => Data.Buffers[1];
public UnionType Type => (UnionType)Data.DataType;

public ArrowBuffer ValueOffsetBuffer => Data.Buffers[2];
public UnionMode Mode => Type.Mode;

public ArrowBuffer TypeBuffer => Data.Buffers[0];

public ReadOnlySpan<byte> TypeIds => TypeBuffer.Span;

public ReadOnlySpan<int> ValueOffsets => ValueOffsetBuffer.Span.CastTo<int>().Slice(0, Length + 1);
public int Length => Data.Length;

public int Offset => Data.Offset;

public UnionArray(ArrayData data)
: base(data)
public int NullCount => Data.NullCount;

/// <summary>
/// Reports whether the element at <paramref name="index"/> is non-null by asking the
/// child array selected by the element's type id.
/// </summary>
/// <param name="index">Index of the element within this union.</param>
/// <returns>
/// <c>true</c> when the array has no nulls or the selected child reports the value as valid.
/// </returns>
public bool IsValid(int index)
{
    if (NullCount == 0)
    {
        return true;
    }

    IArrowArray field = Fields[TypeIds[index]];

    // In a dense union the value's position inside the child comes from the value-offset
    // buffer (second buffer); in a sparse union the child is indexed by the same position.
    int fieldIndex = Mode == UnionMode.Dense
        ? Data.Buffers[1].Span.CastTo<int>()[index]
        : index;

    return field.IsValid(fieldIndex);
}

public bool IsNull(int index) => !IsValid(index);

protected UnionArray(ArrayData data)
{
Data = data;
data.EnsureDataType(ArrowTypeId.Union);
data.EnsureBufferCount(3);
}

public IArrowArray GetChild(int index)
public static UnionArray Create(ArrayData data)
{
// TODO: Implement
throw new NotImplementedException();
return ((UnionType)data.DataType).Mode switch
{
UnionMode.Dense => new DenseUnionArray(data),
UnionMode.Sparse => new SparseUnionArray(data),
_ => throw new InvalidOperationException("unknown union mode in array creation")
};
}

public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor);
public void Accept(IArrowArrayVisitor visitor) => Array.Accept(this, visitor);

/// <summary>
/// Disposes the array by calling <see cref="Dispose(bool)"/> with <c>true</c>, then
/// suppresses finalization for this instance.
/// </summary>
public void Dispose()
{
Dispose(true);
GC.SuppressFinalize(this);
}

/// <summary>
/// Disposes the underlying array data when called from <see cref="Dispose()"/>.
/// </summary>
/// <param name="disposing"><c>true</c> to dispose <see cref="Data"/>; <c>false</c> does nothing.</param>
protected virtual void Dispose(bool disposing)
{
    if (!disposing)
    {
        return;
    }

    Data.Dispose();
}

/// <summary>
/// Throws when the actual union mode differs from the expected one.
/// </summary>
/// <param name="expected">The mode required by the caller.</param>
/// <param name="actual">The mode found on the type.</param>
/// <exception cref="ArgumentException">The two modes differ.</exception>
protected static void ValidateMode(UnionMode expected, UnionMode actual)
{
    if (expected == actual)
    {
        return;
    }

    string message = $"Specified union mode <{actual}> does not match expected mode <{expected}>";
    throw new ArgumentException(message, "Mode");
}

/// <summary>
/// Builds one array per child of the array data, in child order.
/// </summary>
/// <returns>The child arrays created through <see cref="ArrowArrayFactory.BuildArray"/>.</returns>
private IReadOnlyList<IArrowArray> InitializeFields()
{
    int childCount = Data.Children.Length;
    var fields = new IArrowArray[childCount];

    int position = 0;
    while (position < childCount)
    {
        fields[position] = ArrowArrayFactory.BuildArray(Data.Children[position]);
        position++;
    }

    return fields;
}
}
}
Loading