Skip to content

Commit

Permalink
GH-43267: [C#] Correctly import sliced arrays through the C Data inte…
Browse files Browse the repository at this point in the history
…rface (#44117)

### What changes are included in this PR?

Changes to the C Data importer to correctly handle nonzero offsets.

### Are these changes tested?

Yes

### Are there any user-facing changes?

No

Closes #43267 
* GitHub Issue: #43267

Authored-by: Curt Hagenlocher <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
  • Loading branch information
CurtHagenlocher authored Sep 16, 2024
1 parent a5d40d0 commit 9ba789d
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 18 deletions.
6 changes: 2 additions & 4 deletions csharp/src/Apache.Arrow/Apache.Arrow.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,16 @@
<Description>Apache Arrow is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware.</Description>
</PropertyGroup>

<PropertyGroup Condition="'$(IsWindows)'=='true'">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net6.0;net8.0;net462</TargetFrameworks>
</PropertyGroup>
<PropertyGroup Condition="'$(IsWindows)'!='true'">
<TargetFrameworks>netstandard2.0;net6.0;net8.0</TargetFrameworks>
</PropertyGroup>

<ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETStandard' or '$(TargetFramework)' == 'net462'">
<PackageReference Include="System.Buffers" Version="4.5.1" />
<PackageReference Include="System.Memory" Version="4.5.5" />
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.7.1" />
<PackageReference Include="System.Threading.Tasks.Extensions" Version="4.5.4" />
<PackageReference Include="System.ValueTuple" Version="4.5.0" />
</ItemGroup>

<ItemGroup>
Expand Down
22 changes: 11 additions & 11 deletions csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ private ArrayData[] ProcessStructChildren(CArrowArray* cArray, IReadOnlyList<Fie

private ArrowBuffer ImportValidityBuffer(CArrowArray* cArray)
{
int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int validityLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8);
return (cArray->buffers[0] == null) ? ArrowBuffer.Empty : new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, validityLength));
}
Expand All @@ -285,7 +285,7 @@ private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray)
throw new InvalidOperationException("Byte arrays are expected to have exactly three buffers");
}

int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = (length + 1) * 4;
int* offsets = (int*)cArray->buffers[1];
Debug.Assert(offsets != null);
Expand All @@ -306,7 +306,7 @@ private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray)
throw new InvalidOperationException("Byte array views are expected to have at least three buffers");
}

int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int viewsLength = length * 16;

long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1];
Expand Down Expand Up @@ -336,7 +336,7 @@ private ArrowBuffer[] ImportLargeByteArrayBuffers(CArrowArray* cArray)
$"is greater than the maximum supported large byte array length ({maxLength})");
}

int length = (int)cArray->length;
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = (length + 1) * 8;
long* offsets = (long*)cArray->buffers[1];
Debug.Assert(offsets != null);
Expand Down Expand Up @@ -364,7 +364,7 @@ private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray)
throw new InvalidOperationException("List arrays are expected to have exactly two buffers");
}

int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = (length + 1) * 4;

ArrowBuffer[] buffers = new ArrowBuffer[2];
Expand All @@ -381,7 +381,7 @@ private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray)
throw new InvalidOperationException("List view arrays are expected to have exactly three buffers");
}

int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = length * 4;

ArrowBuffer[] buffers = new ArrowBuffer[3];
Expand All @@ -407,7 +407,7 @@ private ArrowBuffer[] ImportLargeListBuffers(CArrowArray* cArray)
$"is greater than the maximum supported large list array length ({maxLength})");
}

int length = (int)cArray->length;
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = (length + 1) * 8;

ArrowBuffer[] buffers = new ArrowBuffer[2];
Expand Down Expand Up @@ -436,7 +436,7 @@ private ArrowBuffer[] ImportDenseUnionBuffers(CArrowArray* cArray)
{
throw new InvalidOperationException("Dense union arrays are expected to have exactly two children");
}
int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int offsetsLength = length * 4;

ArrowBuffer[] buffers = new ArrowBuffer[2];
Expand All @@ -454,7 +454,7 @@ private ArrowBuffer[] ImportSparseUnionBuffers(CArrowArray* cArray)
}

ArrowBuffer[] buffers = new ArrowBuffer[1];
buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->length));
buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->offset + (int)cArray->length));

return buffers;
}
Expand All @@ -467,10 +467,10 @@ private ArrowBuffer[] ImportFixedWidthBuffers(CArrowArray* cArray, int bitWidth)
}

// validity, data
int length = checked((int)cArray->length);
int length = checked((int)cArray->offset + (int)cArray->length);
int valuesLength;
if (bitWidth >= 8)
valuesLength = checked((int)(cArray->length * bitWidth / 8));
valuesLength = checked(length * bitWidth / 8);
else
valuesLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8);

Expand Down
11 changes: 11 additions & 0 deletions csharp/src/Apache.Arrow/RecordBatch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,17 @@ public RecordBatch Clone(MemoryAllocator allocator = default)
return new RecordBatch(Schema, arrays, Length);
}

public RecordBatch Slice(int offset, int length)
{
if (offset > Length)
{
throw new ArgumentException($"Offset {offset} cannot be greater than Length {Length} for RecordBatch.Slice");
}

length = Math.Min(Length - offset, length);
return new RecordBatch(Schema, _arrays.Select(a => ArrowArrayFactory.Slice(a, offset, length)), length);
}

public void Accept(IArrowArrayVisitor visitor)
{
switch (visitor)
Expand Down
2 changes: 0 additions & 2 deletions csharp/src/Apache.Arrow/Utility.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

using Apache.Arrow.Flatbuf;
using System;
using System.Collections.Generic;
using System.Text;

namespace Apache.Arrow
{
Expand Down
4 changes: 3 additions & 1 deletion csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,9 @@ private void CompareArrays(FixedSizeListArray actualArray)
var listSize = ((FixedSizeListType)expectedArray.Data.DataType).ListSize;
var expectedValuesSlice = ArrowArrayFactory.Slice(
expectedArray.Values, expectedArray.Offset * listSize, expectedArray.Length * listSize);
actualArray.Values.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare));
var actualValuesSlice = ArrowArrayFactory.Slice(
actualArray.Values, actualArray.Offset * listSize, actualArray.Length * listSize);
actualValuesSlice.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare));
}

private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer expectedValidityBuffer, int expectedBufferOffset, ArrowBuffer actualValidityBuffer, int actualBufferOffset)
Expand Down
18 changes: 18 additions & 0 deletions csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -92,5 +92,23 @@ public unsafe void CallsReleaseForInvalid()
GC.KeepAlive(releaseCallback);
}
#endif

[Fact]
public unsafe void RoundTripInt32ArrayWithOffset()
{
Int32Array array = new Int32Array.Builder()
.AppendRange(new[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 })
.Build();
IArrowArray sliced = array.Slice(2, 6);
CArrowArray* cArray = CArrowArray.Create();
CArrowArrayExporter.ExportArray(sliced, cArray);
using (var importedSlice = (Int32Array)CArrowArrayImporter.ImportArray(cArray, array.Data.DataType))
{
Assert.Equal(6, importedSlice.Length);
Assert.Equal(2, importedSlice.Offset);
Assert.Equal(2, importedSlice.GetValue(0));
}
CArrowArray.Free(cArray);
}
}
}
43 changes: 43 additions & 0 deletions csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,49 @@ public unsafe void RoundTripTestBatch()
CArrowSchema.Free(cImportSchema);
}

[SkippableFact]
public unsafe void RoundTripTestSlicedBatch()
{
// TODO: Enable these once this the version of pyarrow referenced during testing supports them
HashSet<ArrowTypeId> unsupported = new HashSet<ArrowTypeId> { ArrowTypeId.ListView, ArrowTypeId.BinaryView, ArrowTypeId.StringView };
RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, excludedTypes: unsupported);
RecordBatch batch1slice = batch1.Slice(1, 2);
RecordBatch batch2 = batch1slice.Clone();

CArrowArray* cExportArray = CArrowArray.Create();
CArrowArrayExporter.ExportRecordBatch(batch1slice, cExportArray);

CArrowSchema* cExportSchema = CArrowSchema.Create();
CArrowSchemaExporter.ExportSchema(batch1.Schema, cExportSchema);

CArrowArray* cImportArray = CArrowArray.Create();
CArrowSchema* cImportSchema = CArrowSchema.Create();

// For Python, we need to provide the pointers
long exportArrayPtr = ((IntPtr)cExportArray).ToInt64();
long exportSchemaPtr = ((IntPtr)cExportSchema).ToInt64();
long importArrayPtr = ((IntPtr)cImportArray).ToInt64();
long importSchemaPtr = ((IntPtr)cImportSchema).ToInt64();

using (Py.GIL())
{
dynamic pa = Py.Import("pyarrow");
dynamic exportedPyArray = pa.RecordBatch._import_from_c(exportArrayPtr, exportSchemaPtr);
exportedPyArray._export_to_c(importArrayPtr, importSchemaPtr);
}

Schema schema = CArrowSchemaImporter.ImportSchema(cImportSchema);
RecordBatch importedBatch = CArrowArrayImporter.ImportRecordBatch(cImportArray, schema);

ArrowReaderVerifier.CompareBatches(batch2, importedBatch, strictCompare: false); // Non-strict because span lengths won't match.

// Since we allocated, we are responsible for freeing the pointer.
CArrowArray.Free(cExportArray);
CArrowSchema.Free(cExportSchema);
CArrowArray.Free(cImportArray);
CArrowSchema.Free(cImportSchema);
}

[SkippableFact]
public unsafe void ExportBatchReader()
{
Expand Down

0 comments on commit 9ba789d

Please sign in to comment.