From 1f8105b6519837324e054dbc355fa1d74c35c31f Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Sat, 14 Sep 2024 06:38:13 -0700 Subject: [PATCH 1/2] Correctly import sliced arrays through the C Data interface. --- csharp/src/Apache.Arrow/Apache.Arrow.csproj | 6 +-- .../src/Apache.Arrow/C/CArrowArrayImporter.cs | 29 +++++++------ csharp/src/Apache.Arrow/RecordBatch.cs | 11 +++++ csharp/src/Apache.Arrow/Utility.cs | 2 - .../Apache.Arrow.Tests/ArrowReaderVerifier.cs | 4 +- .../CDataInterfaceDataTests.cs | 18 ++++++++ .../CDataInterfacePythonTests.cs | 43 +++++++++++++++++++ 7 files changed, 94 insertions(+), 19 deletions(-) diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index 034876a114b0b..a845f8e693695 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -7,18 +7,16 @@ Apache Arrow is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware. - + netstandard2.0;net6.0;net8.0;net462 - - netstandard2.0;net6.0;net8.0 - + diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index 68b67f3d7c620..bac2113f535ca 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -148,6 +148,11 @@ public RecordBatch GetAsRecordBatch(Schema schema) } private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type) + { + return GetAsArrayData(cArray, type, checked((int)cArray->offset), checked((int)cArray->length)); + } + + private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type, int offset, int length) { ArrayData[] children = null; ArrowBuffer[] buffers = null; @@ -239,7 +244,7 @@ private ArrayData[] ProcessListChildren(CArrowArray* cArray, IArrowType type) } ArrayData[] children = new ArrayData[1]; - children[0] = GetAsArrayData(cArray->children[0], type); + children[0] = GetAsArrayData(cArray->children[0], type, checked((int)cArray->offset), checked((int)cArray->length)); return children; } @@ -260,7 +265,7 @@ private ArrayData[] ProcessStructChildren(CArrowArray* cArray, IReadOnlyListlength); + int length = checked((int)cArray->offset + (int)cArray->length); int validityLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8); return (cArray->buffers[0] == null) ? ArrowBuffer.Empty : new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, validityLength)); } @@ -285,7 +290,7 @@ private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray) throw new InvalidOperationException("Byte arrays are expected to have exactly three buffers"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = (length + 1) * 4; int* offsets = (int*)cArray->buffers[1]; Debug.Assert(offsets != null); @@ -306,7 +311,7 @@ private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray) throw new InvalidOperationException("Byte array views are expected to have at least three buffers"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int viewsLength = length * 16; long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1]; @@ -336,7 +341,7 @@ private ArrowBuffer[] ImportLargeByteArrayBuffers(CArrowArray* cArray) $"is greater than the maximum supported large byte array length ({maxLength})"); } - int length = (int)cArray->length; + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = (length + 1) * 8; long* offsets = (long*)cArray->buffers[1]; Debug.Assert(offsets != null); @@ -364,7 +369,7 @@ private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) throw new InvalidOperationException("List arrays are expected to have exactly two buffers"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = (length + 1) * 4; ArrowBuffer[] buffers = new ArrowBuffer[2]; @@ -381,7 +386,7 @@ private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray) throw new InvalidOperationException("List view arrays are expected to have exactly three buffers"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = length * 4; ArrowBuffer[] buffers = new ArrowBuffer[3]; @@ -407,7 +412,7 @@ private ArrowBuffer[] ImportLargeListBuffers(CArrowArray* cArray) $"is greater than the maximum supported large list array length ({maxLength})"); } - int length = (int)cArray->length; + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = (length + 1) * 8; ArrowBuffer[] buffers = new ArrowBuffer[2]; @@ -436,7 +441,7 @@ private ArrowBuffer[] ImportDenseUnionBuffers(CArrowArray* cArray) { throw new InvalidOperationException("Dense union arrays are expected to have exactly two children"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = length * 4; ArrowBuffer[] buffers = new ArrowBuffer[2]; @@ -454,7 +459,7 @@ private ArrowBuffer[] ImportSparseUnionBuffers(CArrowArray* cArray) } ArrowBuffer[] buffers = new ArrowBuffer[1]; - buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->length)); + buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->offset + (int)cArray->length)); return buffers; } @@ -467,10 +472,10 @@ private ArrowBuffer[] ImportFixedWidthBuffers(CArrowArray* cArray, int bitWidth) } // validity, data - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int valuesLength; if (bitWidth >= 8) - valuesLength = checked((int)(cArray->length * bitWidth / 8)); + valuesLength = checked(length * bitWidth / 8); else valuesLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8); diff --git a/csharp/src/Apache.Arrow/RecordBatch.cs b/csharp/src/Apache.Arrow/RecordBatch.cs index 9cc81b1648ea8..4067ba9ac6c2b 100644 --- a/csharp/src/Apache.Arrow/RecordBatch.cs +++ b/csharp/src/Apache.Arrow/RecordBatch.cs @@ -100,6 +100,17 @@ public RecordBatch Clone(MemoryAllocator allocator = default) return new RecordBatch(Schema, arrays, Length); } + public RecordBatch Slice(int offset, int length) + { + if (offset > Length) + { + throw new ArgumentException($"Offset {offset} cannot be greater than Length {Length} for RecordBatch.Slice"); + } + + length = Math.Min(Length - offset, length); + return new RecordBatch(Schema, _arrays.Select(a => ArrowArrayFactory.Slice(a, offset, length)), length); + } + public void Accept(IArrowArrayVisitor visitor) { switch (visitor) diff --git a/csharp/src/Apache.Arrow/Utility.cs b/csharp/src/Apache.Arrow/Utility.cs index c4e5732e6eaa7..22b3ff15f1c5c 100644 --- a/csharp/src/Apache.Arrow/Utility.cs +++ b/csharp/src/Apache.Arrow/Utility.cs @@ -13,10 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -using Apache.Arrow.Flatbuf; using System; using System.Collections.Generic; -using System.Text; namespace Apache.Arrow { diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 85f7b75f931ef..35b2c4e7f2ad3 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -566,7 +566,9 @@ private void CompareArrays(FixedSizeListArray actualArray) var listSize = ((FixedSizeListType)expectedArray.Data.DataType).ListSize; var expectedValuesSlice = ArrowArrayFactory.Slice( expectedArray.Values, expectedArray.Offset * listSize, expectedArray.Length * listSize); - actualArray.Values.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare)); + var actualValuesSlice = ArrowArrayFactory.Slice( + actualArray.Values, actualArray.Offset * listSize, actualArray.Length * listSize); + actualValuesSlice.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare)); } private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer expectedValidityBuffer, int expectedBufferOffset, ArrowBuffer actualValidityBuffer, int actualBufferOffset) diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs index 2bd4d4d661942..70ab1fdae2f64 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs @@ -92,5 +92,23 @@ public unsafe void CallsReleaseForInvalid() GC.KeepAlive(releaseCallback); } #endif + + [Fact] + public unsafe void RoundTripInt32ArrayWithOffset() + { + Int32Array array = new Int32Array.Builder() + .AppendRange(new[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }) + .Build(); + IArrowArray sliced = array.Slice(2, 6); + CArrowArray* cArray = CArrowArray.Create(); + CArrowArrayExporter.ExportArray(sliced, cArray); + using (var importedSlice = (Int32Array)CArrowArrayImporter.ImportArray(cArray, array.Data.DataType)) + { + Assert.Equal(6, importedSlice.Length); + Assert.Equal(2, importedSlice.Offset); + Assert.Equal(2, importedSlice.GetValue(0)); + } + CArrowArray.Free(cArray); + } } } diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index fee18d165cdbd..638cbfb272de4 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -792,6 +792,49 @@ public unsafe void RoundTripTestBatch() CArrowSchema.Free(cImportSchema); } + [SkippableFact] + public unsafe void RoundTripTestSlicedBatch() + { + // TODO: Enable these once this the version of pyarrow referenced during testing supports them + HashSet unsupported = new HashSet { ArrowTypeId.ListView, ArrowTypeId.BinaryView, ArrowTypeId.StringView }; + RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, excludedTypes: unsupported); + RecordBatch batch1slice = batch1.Slice(1, 2); + RecordBatch batch2 = batch1slice.Clone(); + + CArrowArray* cExportArray = CArrowArray.Create(); + CArrowArrayExporter.ExportRecordBatch(batch1slice, cExportArray); + + CArrowSchema* cExportSchema = CArrowSchema.Create(); + CArrowSchemaExporter.ExportSchema(batch1.Schema, cExportSchema); + + CArrowArray* cImportArray = CArrowArray.Create(); + CArrowSchema* cImportSchema = CArrowSchema.Create(); + + // For Python, we need to provide the pointers + long exportArrayPtr = ((IntPtr)cExportArray).ToInt64(); + long exportSchemaPtr = ((IntPtr)cExportSchema).ToInt64(); + long importArrayPtr = ((IntPtr)cImportArray).ToInt64(); + long importSchemaPtr = ((IntPtr)cImportSchema).ToInt64(); + + using (Py.GIL()) + { + dynamic pa = Py.Import("pyarrow"); + dynamic exportedPyArray = pa.RecordBatch._import_from_c(exportArrayPtr, exportSchemaPtr); + exportedPyArray._export_to_c(importArrayPtr, importSchemaPtr); + } + + Schema schema = CArrowSchemaImporter.ImportSchema(cImportSchema); + RecordBatch importedBatch = CArrowArrayImporter.ImportRecordBatch(cImportArray, schema); + + ArrowReaderVerifier.CompareBatches(batch2, importedBatch, strictCompare: false); // Non-strict because span lengths won't match. + + // Since we allocated, we are responsible for freeing the pointer. + CArrowArray.Free(cExportArray); + CArrowSchema.Free(cExportSchema); + CArrowArray.Free(cImportArray); + CArrowSchema.Free(cImportSchema); + } + [SkippableFact] public unsafe void ExportBatchReader() { From 46bf41bf936eb34d1353f68cb1a80d67d19a9567 Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Sat, 14 Sep 2024 06:43:50 -0700 Subject: [PATCH 2/2] Revert unneeded changes --- csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index bac2113f535ca..c454380e17cfc 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -148,11 +148,6 @@ public RecordBatch GetAsRecordBatch(Schema schema) } private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type) - { - return GetAsArrayData(cArray, type, checked((int)cArray->offset), checked((int)cArray->length)); - } - - private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type, int offset, int length) { ArrayData[] children = null; ArrowBuffer[] buffers = null; @@ -244,7 +239,7 @@ private ArrayData[] ProcessListChildren(CArrowArray* cArray, IArrowType type) } ArrayData[] children = new ArrayData[1]; - children[0] = GetAsArrayData(cArray->children[0], type, checked((int)cArray->offset), checked((int)cArray->length)); + children[0] = GetAsArrayData(cArray->children[0], type); return children; }