Skip to content

Commit d6ef480

Browse files
paleolimbot and WillAyd authored
feat(python): Add StringView and BinaryView IO to Python bindings (#637)
This PR implements StringView support in the Python bindings. It is a thin wrapper around the C functions added, although we should perhaps abstract some of the buffer info calculation into the C library since I had to work around that in the R bindings as well.

```python
import nanoarrow as na

array = na.Array(["abc", "def", None, "longer than 12 bytes"], na.string_view())
array
#> nanoarrow.Array<string_view>[4]
#> 'abc'
#> 'def'
#> None
#> 'longer than 12 bytes'

array.buffers
#> (nanoarrow.c_buffer.CBufferView(bool[1 b] 11010000),
#>  nanoarrow.c_buffer.CBufferView(string_view[64 b] b'\x03\x00\x00\x00abc\x00\x00\x00\x00\x00\x00\x00\x00\x00'...),
#>  nanoarrow.c_buffer.CBufferView(string[20 b] b'longer than 12 bytes'),
#>  nanoarrow.c_buffer.CBufferView(int64[8 b] 20))
```

---------

Co-authored-by: William Ayd <[email protected]>
1 parent 97e7c61 commit d6ef480

File tree

8 files changed

+163
-18
lines changed

8 files changed

+163
-18
lines changed

python/src/nanoarrow/_array.pyx

+73-10
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@ from cpython.pycapsule cimport PyCapsule_GetPointer
2222
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
2323
from cpython cimport (
2424
Py_buffer,
25-
PyObject_GetBuffer,
2625
PyBuffer_Release,
2726
PyBUF_ANY_CONTIGUOUS,
2827
PyBUF_FORMAT,
28+
PyBytes_FromStringAndSize,
29+
PyObject_GetBuffer,
30+
PyUnicode_FromStringAndSize,
2931
)
3032

3133
from nanoarrow_c cimport (
@@ -43,6 +45,9 @@ from nanoarrow_c cimport (
4345
ArrowArrayView,
4446
ArrowArrayViewComputeNullCount,
4547
ArrowArrayViewInitFromSchema,
48+
ArrowArrayViewIsNull,
49+
ArrowArrayViewGetStringUnsafe,
50+
ArrowArrayViewGetBytesUnsafe,
4651
ArrowArrayViewSetArray,
4752
ArrowArrayViewSetArrayMinimal,
4853
ArrowBitCountSet,
@@ -57,6 +62,7 @@ from nanoarrow_c cimport (
5762
ArrowValidationLevel,
5863
NANOARROW_BUFFER_TYPE_DATA,
5964
NANOARROW_BUFFER_TYPE_DATA_OFFSET,
65+
NANOARROW_BUFFER_TYPE_DATA_VIEW,
6066
NANOARROW_BUFFER_TYPE_TYPE_ID,
6167
NANOARROW_BUFFER_TYPE_UNION_OFFSET,
6268
NANOARROW_BUFFER_TYPE_VALIDITY,
@@ -78,6 +84,7 @@ from nanoarrow._device cimport Device, CSharedSyncEvent
7884

7985
from nanoarrow._buffer cimport CBuffer, CBufferView
8086
from nanoarrow._schema cimport CSchema, CLayout
87+
from nanoarrow cimport _types
8188
from nanoarrow._utils cimport (
8289
alloc_c_array,
8390
alloc_c_device_array,
@@ -189,13 +196,48 @@ cdef class CArrayView:
189196

190197
@property
def n_buffers(self):
    """Number of buffers exposed by this array view.

    Binary view and string view arrays report two fixed buffers, plus one
    buffer per variadic buffer, plus one trailing buffer. Every other
    storage type reports the buffer count of its layout.
    """
    if not _types.is_data_view(self._ptr.storage_type):
        return self.layout.n_buffers

    # 2 fixed buffers + the variadic buffers + 1 trailing buffer
    return 2 + self._ptr.n_variadic_buffers + 1
193203

194-
def buffer_type(self, int64_t i):
204+
def _buffer_info(self, int64_t i):
    """Describe buffer ``i`` of this array view.

    Returns a 5-tuple ``(buffer_type, data_type, element_size_bits,
    address, size_bytes)``.

    Raises ``IndexError`` when ``i`` is negative or not less than
    ``self.n_buffers``.
    """
    if i < 0 or i >= self.n_buffers:
        raise IndexError(f"{i} out of range [0, {self.n_buffers}]")

    # Every buffer of a non-view type, and the first two buffers of a
    # binary/string view, are described directly by the layout and the
    # precomputed buffer views.
    if i < 2 or not _types.is_data_view(self._ptr.storage_type):
        return (
            self._ptr.layout.buffer_type[i],
            self._ptr.layout.buffer_data_type[i],
            self._ptr.layout.element_size_bits[i],
            <uintptr_t>self._ptr.buffer_views[i].data.data,
            self._ptr.buffer_views[i].size_bytes
        )

    # The buffer just past the variadic buffers is reported as int64 data
    # holding 8 bytes per variadic buffer.
    if i == (2 + self._ptr.n_variadic_buffers):
        return (
            NANOARROW_BUFFER_TYPE_DATA,
            _types.INT64,
            64,
            <uintptr_t>self._ptr.array.buffers[i],
            (self._ptr.n_variadic_buffers) * 8
        )

    # Remaining indices are the variadic buffers themselves: reported as
    # string data for string views and binary data otherwise, with the
    # byte size read from entry ``i - 2`` of the trailing int64 buffer.
    if int(self._ptr.storage_type) == _types.STRING_VIEW:
        variadic_data_type = _types.STRING
    else:
        variadic_data_type = _types.BINARY

    return (
        NANOARROW_BUFFER_TYPE_DATA,
        variadic_data_type,
        0,
        <uintptr_t>self._ptr.array.buffers[i],
        (<int64_t*>self._ptr.array.buffers[2 + self._ptr.n_variadic_buffers])[i - 2]
    )
238+
239+
def buffer_type(self, int64_t i):
240+
buffer_type = self._buffer_info(i)[0]
199241
if buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY:
200242
return "validity"
201243
elif buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID:
@@ -206,14 +248,17 @@ cdef class CArrayView:
206248
return "data_offset"
207249
elif buffer_type == NANOARROW_BUFFER_TYPE_DATA:
208250
return "data"
251+
elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_VIEW:
252+
return "data_view"
209253
else:
210254
return "none"
211255

212256
def buffer(self, int64_t i):
213-
if i < 0 or i >= self.n_buffers:
214-
raise IndexError(f"{i} out of range [0, {self.n_buffers}]")
257+
_, data_type, element_size_bits, addr, size = self._buffer_info(i)
215258

216-
cdef ArrowBufferView* buffer_view = &(self._ptr.buffer_views[i])
259+
cdef ArrowBufferView buffer_view
260+
buffer_view.data.data = <void*>addr
261+
buffer_view.size_bytes = size
217262

218263
# Check the buffer size here because the error later is cryptic.
219264
# Buffer sizes are set to -1 when they are "unknown", so because of errors
@@ -224,10 +269,10 @@ cdef class CArrayView:
224269

225270
return CBufferView(
226271
self._array_base,
227-
<uintptr_t>buffer_view.data.data,
228-
buffer_view.size_bytes,
229-
self._ptr.layout.buffer_data_type[i],
230-
self._ptr.layout.element_size_bits[i],
272+
addr,
273+
size,
274+
data_type,
275+
element_size_bits,
231276
self._event
232277
)
233278

@@ -249,6 +294,24 @@ cdef class CArrayView:
249294

250295
return dictionary
251296

297+
def _iter_bytes(self, int64_t offset, int64_t length) -> bytes | None:
    """Yield ``length`` elements starting at ``offset`` as ``bytes``.

    Null elements are yielded as ``None``. Each non-null element is copied
    into a new ``bytes`` object from the view returned by
    ``ArrowArrayViewGetBytesUnsafe``.

    ``offset`` is the index of the first element and ``length`` is the
    number of elements to visit.
    """
    cdef ArrowBufferView item_view
    # ``length`` is a count, not an end index: the stop value must be
    # ``offset + length``. Using ``length`` alone as the stop value skipped
    # items (or yielded nothing) whenever ``offset`` was non-zero.
    # NOTE(review): the *Unsafe getter presumably performs no bounds check,
    # so callers are expected to pass an in-range slice -- confirm.
    for i in range(offset, offset + length):
        if ArrowArrayViewIsNull(self._ptr, i):
            yield None
        else:
            item_view = ArrowArrayViewGetBytesUnsafe(self._ptr, i)
            yield PyBytes_FromStringAndSize(item_view.data.as_char, item_view.size_bytes)
305+
306+
def _iter_str(self, int64_t offset, int64_t length) -> str | None:
    """Yield ``length`` elements starting at ``offset`` as ``str``.

    Null elements are yielded as ``None``. Each non-null element is built
    with ``PyUnicode_FromStringAndSize`` from the view returned by
    ``ArrowArrayViewGetStringUnsafe``.

    ``offset`` is the index of the first element and ``length`` is the
    number of elements to visit.
    """
    cdef ArrowStringView item_view
    # ``length`` is a count, not an end index: the stop value must be
    # ``offset + length``. Using ``length`` alone as the stop value skipped
    # items (or yielded nothing) whenever ``offset`` was non-zero.
    # NOTE(review): the *Unsafe getter presumably performs no bounds check,
    # so callers are expected to pass an in-range slice -- confirm.
    for i in range(offset, offset + length):
        if ArrowArrayViewIsNull(self._ptr, i):
            yield None
        else:
            item_view = ArrowArrayViewGetStringUnsafe(self._ptr, i)
            yield PyUnicode_FromStringAndSize(item_view.data, item_view.size_bytes)
314+
252315
def __repr__(self):
253316
return _repr_utils.array_view_repr(self)
254317

python/src/nanoarrow/_types.pxd

+2
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ cpdef bint has_time_unit(int type_id)
9090

9191
cpdef bint is_union(int type_id)
9292

93+
cpdef bint is_data_view(int type_id)
94+
9395
cdef int to_format(int type_id, int element_size_bits, size_t out_size, char* out)
9496

9597
cdef tuple from_format(format)

python/src/nanoarrow/_types.pyi

+7
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import enum
2020
from typing import Callable, ClassVar
2121

2222
BINARY: CArrowType
23+
BINARY_VIEW: CArrowType
2324
BOOL: CArrowType
2425
DATE32: CArrowType
2526
DATE64: CArrowType
@@ -47,8 +48,10 @@ LARGE_STRING: CArrowType
4748
LIST: CArrowType
4849
MAP: CArrowType
4950
NA: CArrowType
51+
RUN_END_ENCODED: CArrowType
5052
SPARSE_UNION: CArrowType
5153
STRING: CArrowType
54+
STRING_VIEW: CArrowType
5255
STRUCT: CArrowType
5356
TIME32: CArrowType
5457
TIME64: CArrowType
@@ -61,6 +64,7 @@ UNINITIALIZED: CArrowType
6164
__pyx_capi__: dict
6265
__test__: dict
6366
has_time_unit: _cython_3_0_11.cython_function_or_method
67+
is_data_view: _cython_3_0_11.cython_function_or_method
6468
is_decimal: _cython_3_0_11.cython_function_or_method
6569
is_fixed_size: _cython_3_0_11.cython_function_or_method
6670
is_floating_point: _cython_3_0_11.cython_function_or_method
@@ -72,6 +76,7 @@ sys_byteorder: str
7276
class CArrowType(enum.IntFlag):
7377
__new__: ClassVar[Callable] = ...
7478
BINARY: ClassVar[CArrowType] = ...
79+
BINARY_VIEW: ClassVar[CArrowType] = ...
7580
BOOL: ClassVar[CArrowType] = ...
7681
DATE32: ClassVar[CArrowType] = ...
7782
DATE64: ClassVar[CArrowType] = ...
@@ -99,8 +104,10 @@ class CArrowType(enum.IntFlag):
99104
LIST: ClassVar[CArrowType] = ...
100105
MAP: ClassVar[CArrowType] = ...
101106
NA: ClassVar[CArrowType] = ...
107+
RUN_END_ENCODED: ClassVar[CArrowType] = ...
102108
SPARSE_UNION: ClassVar[CArrowType] = ...
103109
STRING: ClassVar[CArrowType] = ...
110+
STRING_VIEW: ClassVar[CArrowType] = ...
104111
STRUCT: ClassVar[CArrowType] = ...
105112
TIME32: ClassVar[CArrowType] = ...
106113
TIME64: ClassVar[CArrowType] = ...

python/src/nanoarrow/_types.pyx

+11
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,14 @@ cpdef bint is_union(int type_id):
109109
)
110110

111111

112+
cpdef bint is_data_view(int type_id):
    """Return True when type_id is the binary view or the string view type"""
    return type_id == _types.BINARY_VIEW or type_id == _types.STRING_VIEW
118+
119+
112120
cdef tuple from_format(format):
113121
"""Convert a Python buffer protocol format string to a itemsize/type_id tuple
114122
@@ -236,6 +244,9 @@ cdef int to_format(int type_id, int element_size_bits, size_t out_size, char* ou
236244
elif type_id == _types.DECIMAL256:
237245
format_const = "32s"
238246
element_size_bits_calc = 256
247+
elif is_data_view(type_id):
248+
format_const = "16s"
249+
element_size_bits_calc = 128
239250
else:
240251
raise ValueError(f"Unsupported Arrow type_id for format conversion: {type_id}")
241252

python/src/nanoarrow/c_array.py

+2
Original file line numberDiff line numberDiff line change
@@ -547,8 +547,10 @@ def _append_using_buffer_builder(self, obj: Iterable) -> None:
547547
_types.BINARY: "_append_bytes",
548548
_types.LARGE_BINARY: "_append_bytes",
549549
_types.FIXED_SIZE_BINARY: "_append_bytes",
550+
_types.BINARY_VIEW: "_append_bytes",
550551
_types.STRING: "_append_strings",
551552
_types.LARGE_STRING: "_append_strings",
553+
_types.STRING_VIEW: "_append_strings",
552554
_types.INT8: "_append_using_array",
553555
_types.UINT8: "_append_using_array",
554556
_types.INT16: "_append_using_array",

python/src/nanoarrow/iterator.py

+8
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,12 @@ def _binary_iter(self, offset, length):
322322
for start, end in zip(starts, ends):
323323
yield bytes(data[start:end])
324324

325+
def _binary_view_iter(self, offset, length):
    """Iterate a binary view slice via the array view's ``_iter_bytes``."""
    array_view = self._array_view
    return array_view._iter_bytes(offset, length)
327+
328+
def _string_view_iter(self, offset, length):
    """Iterate a string view slice via the array view's ``_iter_str``."""
    array_view = self._array_view
    return array_view._iter_str(offset, length)
330+
325331
def _decimal_iter(self, offset, length):
326332
from decimal import Context, Decimal
327333
from sys import byteorder
@@ -564,6 +570,8 @@ def _get_tzinfo(tz_string, strategy=None):
564570
_types.DURATION: "_duration_iter",
565571
_types.DECIMAL128: "_decimal_iter",
566572
_types.DECIMAL256: "_decimal_iter",
573+
_types.STRING_VIEW: "_string_view_iter",
574+
_types.BINARY_VIEW: "_binary_view_iter",
567575
}
568576

569577
_PRIMITIVE_TYPE_NAMES = [

python/tests/test_c_array.py

+40
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,32 @@ def test_c_array_from_iterable_string():
288288
na.c_array([b"1234"], na.string())
289289

290290

291+
def test_c_array_from_iterable_string_view():
    """Build string_view arrays from iterables and check their buffers."""
    # One value longer than 12 bytes: a variadic data buffer plus the
    # trailing sizes buffer are expected, for four buffers in total.
    with_long_value = na.c_array(
        ["abc", None, "a string longer than 12 bytes"], na.string_view()
    )
    assert with_long_value.length == 3
    assert with_long_value.null_count == 1
    assert with_long_value.n_buffers == 4

    view = with_long_value.view()
    assert len(view.buffer(0)) == 1
    assert bytes(view.buffer(2)) == b"a string longer than 12 bytes"
    assert list(view.buffer(3)) == [len("a string longer than 12 bytes")]

    # Make sure this also works when all strings are inlined (i.e., no variadic buffers)
    all_inlined = na.c_array(["abc", None, "short string"], na.string_view())
    assert all_inlined.length == 3
    assert all_inlined.null_count == 1
    assert all_inlined.n_buffers == 3

    view = all_inlined.view()
    assert len(view.buffer(0)) == 1
    assert len(view.buffer(1)) == 3
    assert len(bytes(view.buffer(1))) == 3 * 16
    assert list(view.buffer(2)) == []
315+
316+
291317
def test_c_array_from_iterable_bytes():
292318
string = na.c_array([b"abc", None, b"defg"], na.binary())
293319
assert string.length == 3
@@ -311,6 +337,20 @@ def test_c_array_from_iterable_bytes():
311337
na.c_array([buf_2d], na.binary())
312338

313339

340+
def test_c_array_from_iterable__view():
    """Build a binary_view array from an iterable and check its buffers."""
    # NOTE(review): the double underscore in the name looks like a dropped
    # word (this test covers binary_view) -- consider renaming.
    binary = na.c_array(
        [b"abc", None, b"a string longer than 12 bytes"], na.binary_view()
    )
    assert binary.length == 3
    assert binary.null_count == 1
    assert binary.n_buffers == 4

    view = binary.view()
    assert len(view.buffer(0)) == 1
    assert bytes(view.buffer(2)) == b"a string longer than 12 bytes"
    assert list(view.buffer(3)) == [len("a string longer than 12 bytes")]
352+
353+
314354
def test_c_array_from_iterable_non_empty_nullable_without_nulls():
315355
c_array = na.c_array([1, 2, 3], na.int32())
316356
assert c_array.length == 3

python/tests/test_iterator.py

+20-8
Original file line numberDiff line numberDiff line change
@@ -68,35 +68,47 @@ def test_iterator_nullable_primitive():
6868
assert list(iter_py(sliced)) == [2, 3, None]
6969

7070

71-
def test_iterator_string():
72-
array = na.c_array(["ab", "cde"], na.string())
71+
@pytest.mark.parametrize(
    "arrow_type", [na.string(), na.large_string(), na.string_view()]
)
def test_iterator_string(arrow_type):
    """Iterate non-null strings for each string-like type, whole and sliced."""
    values = ["ab", "cde"]
    array = na.c_array(values, arrow_type)

    assert list(iter_py(array)) == values

    # Dropping the first element must leave only the last one.
    assert list(iter_py(array[1:])) == ["cde"]
7881

7982

80-
def test_iterator_nullable_string():
81-
array = na.c_array(["ab", "cde", None], na.string())
83+
@pytest.mark.parametrize(
    "arrow_type", [na.string(), na.large_string(), na.string_view()]
)
def test_iterator_nullable_string(arrow_type):
    """Iterate strings with a trailing null for each string-like type."""
    values = ["ab", "cde", None]
    array = na.c_array(values, arrow_type)

    assert list(iter_py(array)) == values

    # The null must survive slicing off the first element.
    assert list(iter_py(array[1:])) == ["cde", None]
8793

8894

89-
def test_iterator_binary():
90-
array = na.c_array([b"ab", b"cde"], na.binary())
95+
@pytest.mark.parametrize(
    "arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
)
def test_iterator_binary(arrow_type):
    """Iterate non-null bytes for each binary-like type, whole and sliced."""
    values = [b"ab", b"cde"]
    array = na.c_array(values, arrow_type)

    assert list(iter_py(array)) == values

    # Dropping the first element must leave only the last one.
    assert list(iter_py(array[1:])) == [b"cde"]
96105

97106

98-
def test_iterator_nullable_binary():
99-
array = na.c_array([b"ab", b"cde", None], na.binary())
107+
@pytest.mark.parametrize(
108+
"arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
109+
)
110+
def test_iterator_nullable_binary(arrow_type):
111+
array = na.c_array([b"ab", b"cde", None], arrow_type)
100112

101113
assert list(iter_py(array)) == [b"ab", b"cde", None]
102114

0 commit comments

Comments
 (0)