Skip to content

Commit 3aa0ec1

Browse files
feat(python): Implement bindings to IPC writer (#586)
This PR implements bindings to the IPC writer in the nanoarrow C library. This adds: - An `ipc.StreamWriter()` class roughly mirroring pyarrow's `ipc.Stream()` - `Schema.serialize()` and `Array.serialize()` to match pyarrow's `serialize()` methods. ```python import io import nanoarrow as na from nanoarrow.ipc import StreamWriter, InputStream out = io.BytesIO() writer = StreamWriter.from_writable(out) writer.write_stream(InputStream.example()) na.Array(InputStream.from_readable(out.getvalue())) #> nanoarrow.Array<non-nullable struct<some_col: int32>>[3] #> {'some_col': 1} #> {'some_col': 2} #> {'some_col': 3} ``` --------- Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 43001a9 commit 3aa0ec1

File tree

10 files changed

+699
-75
lines changed

10 files changed

+699
-75
lines changed

dev/benchmarks/python/ipc.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,12 @@ def fixture_path(self, name):
4343
return os.path.join(self.fixtures_dir, name)
4444

4545
def read_fixture_file(self, name):
46-
with ipc.Stream.from_path(self.fixture_path(name)) as in_stream:
46+
with ipc.InputStream.from_path(self.fixture_path(name)) as in_stream:
4747
list(na.c_array_stream(in_stream))
4848

4949
def read_fixture_buffer(self, name):
5050
f = io.BytesIO(self.fixture_buffer[name])
51-
with ipc.Stream.from_readable(f) as in_stream:
51+
with ipc.InputStream.from_readable(f) as in_stream:
5252
list(na.c_array_stream(in_stream))
5353

5454
def time_read_float64_basic_file(self):

python/src/nanoarrow/_ipc_lib.pyx

+183-12
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,24 @@
1919
# cython: linetrace=True
2020

2121
from libc.stdint cimport uint8_t, int64_t, uintptr_t
22-
from libc.errno cimport EIO
22+
from libc.errno cimport EIO, EAGAIN
2323
from libc.stdio cimport snprintf
2424
from cpython.ref cimport PyObject, Py_INCREF, Py_DECREF
2525
from cpython cimport Py_buffer, PyBuffer_FillInfo
2626

2727
from nanoarrow_c cimport (
28+
ArrowArrayStream,
29+
ArrowArrayView,
30+
ArrowSchema,
2831
ArrowErrorCode,
2932
ArrowError,
3033
NANOARROW_OK,
31-
ArrowArrayStream,
3234
)
3335

36+
from nanoarrow._schema cimport CSchema
37+
from nanoarrow._array cimport CArrayView
38+
from nanoarrow._utils cimport Error
39+
3440

3541
cdef extern from "nanoarrow_ipc.h" nogil:
3642
struct ArrowIpcInputStream:
@@ -48,18 +54,43 @@ cdef extern from "nanoarrow_ipc.h" nogil:
4854
ArrowArrayStream* out, ArrowIpcInputStream* input_stream,
4955
ArrowIpcArrayStreamReaderOptions* options)
5056

57+
struct ArrowIpcOutputStream:
58+
ArrowErrorCode (*write)(ArrowIpcOutputStream* stream, const void* buf,
59+
int64_t buf_size_bytes, int64_t* size_written_out,
60+
ArrowError* error)
61+
void (*release)(ArrowIpcOutputStream* stream)
62+
void* private_data
5163

52-
cdef class PyInputStreamPrivate:
64+
struct ArrowIpcWriter:
65+
void* private_data
66+
67+
ArrowErrorCode ArrowIpcWriterInit(ArrowIpcWriter* writer,
68+
ArrowIpcOutputStream* output_stream)
69+
void ArrowIpcWriterReset(ArrowIpcWriter* writer)
70+
ArrowErrorCode ArrowIpcWriterWriteSchema(ArrowIpcWriter* writer,
71+
const ArrowSchema* in_,
72+
ArrowError* error)
73+
ArrowErrorCode ArrowIpcWriterWriteArrayView(ArrowIpcWriter* writer,
74+
const ArrowArrayView* in_,
75+
ArrowError* error)
76+
77+
ArrowErrorCode ArrowIpcWriterWriteArrayStream(ArrowIpcWriter* writer,
78+
ArrowArrayStream* in_,
79+
ArrowError* error)
80+
81+
cdef class PyStreamPrivate:
5382
cdef object _obj
5483
cdef bint _close_obj
5584
cdef void* _addr
5685
cdef Py_ssize_t _size_bytes
86+
cdef bint _buffer_readonly
5787

58-
def __cinit__(self, obj, close_obj=False):
88+
def __cinit__(self, obj, bint buffer_readonly, bint close_obj=False):
5989
self._obj = obj
6090
self._close_obj = close_obj
6191
self._addr = NULL
6292
self._size_bytes = 0
93+
self._buffer_readonly = buffer_readonly
6394

6495
@property
6596
def obj(self):
@@ -78,14 +109,16 @@ cdef class PyInputStreamPrivate:
78109
return self._size_bytes
79110

80111
# Implement the buffer protocol so that this object can be used as
81-
# the argument to xxx.readinto(). This ensures that no extra copies
82-
# (beyond any buffering done by the upstream file-like object) are held
83-
# since the upstream object has access to the preallocated output buffer.
84-
# In this case, the preallocation is done by the ArrowArrayStream
112+
# the argument to xxx.readinto() or xxx.write(). This ensures that
113+
# no extra copies (beyond any buffering done by the upstream file-like object)
114+
# are held since the upstream object has access to the preallocated output buffer.
115+
# In the read case, the preallocation is done by the ArrowArrayStream
85116
# implementation before issuing each read call (two per message, with
86117
# an extra call for a RecordBatch message to get the actual buffer data).
118+
# In the write case, this will be a view of whatever information was provided to
119+
# the write callback.
87120
def __getbuffer__(self, Py_buffer* buffer, int flags):
88-
PyBuffer_FillInfo(buffer, self, self._addr, self._size_bytes, 0, flags)
121+
PyBuffer_FillInfo(buffer, self, self._addr, self._size_bytes, self._buffer_readonly, flags)
89122

90123
def __releasebuffer__(self, Py_buffer* buffer):
91124
pass
@@ -100,8 +133,16 @@ cdef ArrowErrorCode py_input_stream_read(ArrowIpcInputStream* stream, uint8_t* b
100133
stream_private.set_buffer(<uintptr_t>buf, buf_size_bytes)
101134

102135
try:
103-
size_read_out[0] = stream_private.obj.readinto(stream_private)
104-
return NANOARROW_OK
136+
# Non-blocking streams may return None here, or buffered
137+
# wrappers of them may raise BlockingIOError
138+
read_result = stream_private.obj.readinto(stream_private)
139+
140+
if read_result is None:
141+
size_read_out[0] = 0
142+
return EAGAIN
143+
else:
144+
size_read_out[0] = read_result
145+
return NANOARROW_OK
105146
except Exception as e:
106147
cls = type(e).__name__.encode()
107148
msg = str(e).encode()
@@ -126,6 +167,51 @@ cdef void py_input_stream_release(ArrowIpcInputStream* stream) noexcept nogil:
126167
stream.release = NULL
127168

128169

170+
171+
cdef ArrowErrorCode py_output_stream_write(ArrowIpcOutputStream* stream, const void* buf,
                                           int64_t buf_size_bytes, int64_t* size_written_out,
                                           ArrowError* error) noexcept nogil:
    # ArrowIpcOutputStream.write callback: hands the bytes at ``buf`` to the
    # write() method of the Python object held by the stream's private data.
    #
    # On success the value returned by write() is stored in
    # ``size_written_out`` and NANOARROW_OK is returned. If write() returns
    # None, zero bytes are reported and EAGAIN is returned. If anything in the
    # try block raises, "<ExceptionClass>: <message>" is copied into ``error``
    # and EIO is returned.
    with gil:
        private = <object>stream.private_data

        # set_buffer() is defined outside this view; presumably it records the
        # address/size that the private object's __getbuffer__ exposes, which
        # lets the object itself be passed to write() without copying.
        private.set_buffer(<uintptr_t>buf, buf_size_bytes)

        try:
            n_written = private.obj.write(private)

            # Non-blocking streams may return None here; buffered wrappers of
            # them may raise BlockingIOError instead, which is reported as EIO
            # by the handler below like any other exception.
            if n_written is None:
                size_written_out[0] = 0
                return EAGAIN

            size_written_out[0] = n_written
            return NANOARROW_OK
        except Exception as exc:
            exc_type_name = type(exc).__name__.encode()
            exc_text = str(exc).encode()
            snprintf(
                error.message,
                sizeof(error.message),
                "%s: %s",
                <const char*>exc_type_name,
                <const char*>exc_text
            )
            return EIO
202+
203+
cdef void py_output_stream_release(ArrowIpcOutputStream* stream) noexcept nogil:
    # ArrowIpcOutputStream.release callback: closes the wrapped Python object
    # when the private data's ``close_obj`` flag is set, drops the reference
    # that was taken when the private data was attached to the stream, and
    # marks the stream as released by clearing ``private_data``/``release``.
    with gil:
        stream_private = <object>stream.private_data

        try:
            if stream_private.close_obj:
                stream_private.obj.close()
        finally:
            # close() may raise (e.g., a failing flush). This function is
            # noexcept, so an escaping exception abandons the remainder of the
            # body; doing the cleanup in ``finally`` guarantees the reference
            # is still dropped and the stream is still marked as released, so
            # it is neither leaked nor released a second time.
            Py_DECREF(stream_private)
            stream.private_data = NULL
            stream.release = NULL
213+
214+
129215
cdef class CIpcInputStream:
130216
cdef ArrowIpcInputStream _stream
131217

@@ -150,7 +236,11 @@ cdef class CIpcInputStream:
150236
@staticmethod
151237
def from_readable(obj, close_obj=False):
152238
cdef CIpcInputStream stream = CIpcInputStream()
153-
cdef PyInputStreamPrivate private_data = PyInputStreamPrivate(obj, close_obj)
239+
cdef PyStreamPrivate private_data = PyStreamPrivate(
240+
obj,
241+
buffer_readonly=False,
242+
close_obj=close_obj
243+
)
154244

155245
stream._stream.private_data = <PyObject*>private_data
156246
Py_INCREF(private_data)
@@ -166,3 +256,84 @@ def init_array_stream(CIpcInputStream input_stream, uintptr_t out):
166256
cdef int code = ArrowIpcArrayStreamReaderInit(out_ptr, &input_stream._stream, NULL)
167257
if code != NANOARROW_OK:
168258
raise RuntimeError(f"ArrowIpcArrayStreamReaderInit() failed with code [{code}]")
259+
260+
261+
cdef class CIpcOutputStream:
    """Wrapper around a C-level ArrowIpcOutputStream

    The stream is considered valid while its ``release`` callback is set.
    Use :meth:`from_writable` to create a stream backed by a Python object
    that has a ``write()`` method.
    """
    cdef ArrowIpcOutputStream _stream

    def __cinit__(self):
        # A NULL release callback is the marker for "not initialized/released"
        self._stream.release = NULL

    def __dealloc__(self):
        # Same logic as release(), repeated here so that the deallocator
        # does not need to make Python API calls
        if self._stream.release != NULL:
            self._stream.release(&self._stream)

    def is_valid(self):
        """Return True if the underlying stream has not been released"""
        return self._stream.release != NULL

    def release(self):
        """Release the underlying stream

        Returns True if the stream was released by this call or False if
        there was nothing to release.
        """
        if self._stream.release == NULL:
            return False

        self._stream.release(&self._stream)
        return True

    @staticmethod
    def from_writable(obj, close_obj=False):
        """Create an output stream that forwards writes to ``obj.write()``

        ``close_obj`` is passed on to the stream's private data, where the
        release callback consults it to decide whether to call ``obj.close()``.
        """
        cdef CIpcOutputStream out_stream = CIpcOutputStream()
        cdef PyStreamPrivate private_data = PyStreamPrivate(
            obj,
            buffer_readonly=True,
            close_obj=close_obj
        )

        # The C struct holds a borrowed pointer, so take an explicit reference
        # on its behalf; py_output_stream_release() drops it again.
        Py_INCREF(private_data)
        out_stream._stream.private_data = <PyObject*>private_data
        out_stream._stream.write = &py_output_stream_write
        out_stream._stream.release = &py_output_stream_release
        return out_stream
296+
297+
298+
cdef class CIpcWriter:
    """Wrapper around a C-level ArrowIpcWriter

    The writer is considered valid while its ``private_data`` is non-NULL.
    All ``write_*()`` methods raise ``RuntimeError`` when called on a writer
    that is no longer valid (e.g., after :meth:`release`) instead of passing
    the reset writer to the C library.
    """
    cdef ArrowIpcWriter _writer

    def __cinit__(self, CIpcOutputStream stream):
        # Mark as invalid first so that __dealloc__ is a no-op if
        # initialization fails below
        self._writer.private_data = NULL
        if not stream.is_valid():
            raise ValueError("Can't create writer from released stream")

        cdef int code = ArrowIpcWriterInit(&self._writer, &stream._stream)
        Error.raise_error_not_ok("ArrowIpcWriterInit()", code)

    def is_valid(self):
        """Return True if the underlying writer has not been released"""
        return self._writer.private_data != NULL

    def __dealloc__(self):
        if self._writer.private_data != NULL:
            ArrowIpcWriterReset(&self._writer)

    def release(self):
        """Reset the underlying writer if it has not already been released"""
        if self._writer.private_data != NULL:
            ArrowIpcWriterReset(&self._writer)

    def _assert_valid(self):
        # Guard used by the write methods: the C functions must not be handed
        # a writer whose private_data is NULL.
        if self._writer.private_data == NULL:
            raise RuntimeError("Can't write to released writer")

    def write_schema(self, CSchema schema):
        """Write ``schema`` as a schema message

        Raises ``RuntimeError`` if the writer was released; errors reported by
        the C library are raised via ``Error.raise_message_not_ok()``.
        """
        self._assert_valid()
        cdef Error error = Error()
        cdef int code = ArrowIpcWriterWriteSchema(&self._writer, schema._ptr, &error.c_error)
        error.raise_message_not_ok("ArrowIpcWriterWriteSchema()", code)

    def write_array_view(self, CArrayView array_view):
        """Write the content of ``array_view``

        Raises ``RuntimeError`` if the writer was released; errors reported by
        the C library are raised via ``Error.raise_message_not_ok()``.
        """
        self._assert_valid()
        cdef Error error = Error()
        cdef int code = ArrowIpcWriterWriteArrayView(&self._writer, array_view._ptr, &error.c_error)
        error.raise_message_not_ok("ArrowIpcWriterWriteArrayView()", code)

    def write_array_stream(self, uintptr_t stream_addr):
        """Write the ArrowArrayStream located at the address ``stream_addr``

        Raises ``RuntimeError`` if the writer was released; errors reported by
        the C library are raised via ``Error.raise_message_not_ok()``.
        """
        self._assert_valid()
        cdef ArrowArrayStream* array_stream = <ArrowArrayStream*>stream_addr
        cdef Error error = Error()
        cdef int code = ArrowIpcWriterWriteArrayStream(&self._writer, array_stream, &error.c_error)
        error.raise_message_not_ok("ArrowIpcWriterWriteArrayStream()", code)

    def write_end_of_stream(self):
        """Write the end-of-stream marker

        This is done by passing a NULL array view to
        ``ArrowIpcWriterWriteArrayView()``. Raises ``RuntimeError`` if the
        writer was released.
        """
        self._assert_valid()
        cdef Error error = Error()
        cdef int code = ArrowIpcWriterWriteArrayView(&self._writer, NULL, &error.c_error)
        error.raise_message_not_ok("ArrowIpcWriterWriteArrayView()", code)

python/src/nanoarrow/array.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
import itertools
1919
from functools import cached_property
20-
from typing import Iterable, Tuple
20+
from typing import Iterable, Tuple, Union
2121

2222
from nanoarrow._array import CArray, CArrayView
2323
from nanoarrow._array_stream import CMaterializedArrayStream
@@ -542,6 +542,29 @@ def __iter__(self):
542542
"to iterate over elements of this Array"
543543
)
544544

545+
def serialize(self, dst=None) -> Union[bytes, None]:
    """Write this Array into dst as zero or more encapsulated IPC messages

    The stream is written without a leading schema message
    (``write_schema=False``).

    Parameters
    ----------
    dst : file-like, optional
        If present, a file-like object into which the chunks of this array
        should be serialized. If omitted, this will create an
        ``io.BytesIO()`` and return the serialized result.

    Returns
    -------
    bytes or None
        The serialized bytes when ``dst`` is omitted; otherwise ``None``.
    """
    from nanoarrow.ipc import StreamWriter

    # Caller supplied a destination: write into it and return nothing
    if dst is not None:
        writer = StreamWriter.from_writable(dst)
        writer.write_stream(self, write_schema=False)
        return None

    import io

    # No destination: collect the messages in memory and hand back the bytes
    with io.BytesIO() as buffer:
        writer = StreamWriter.from_writable(buffer)
        writer.write_stream(self, write_schema=False)
        return buffer.getvalue()
567+
545568
def to_string(self, width_hint=80, items_hint=10) -> str:
546569
cls_name = _repr_utils.make_class_label(self, module="nanoarrow")
547570
len_text = f"[{len(self)}]"

python/src/nanoarrow/array_stream.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -211,17 +211,17 @@ def from_readable(obj):
211211
Examples
212212
--------
213213
>>> import nanoarrow as na
214-
>>> from nanoarrow.ipc import Stream
215-
>>> with na.ArrayStream.from_readable(Stream.example_bytes()) as stream:
214+
>>> from nanoarrow.ipc import InputStream
215+
>>> with na.ArrayStream.from_readable(InputStream.example_bytes()) as stream:
216216
... stream.read_all()
217217
nanoarrow.Array<non-nullable struct<some_col: int32>>[3]
218218
{'some_col': 1}
219219
{'some_col': 2}
220220
{'some_col': 3}
221221
"""
222-
from nanoarrow.ipc import Stream
222+
from nanoarrow.ipc import InputStream
223223

224-
with Stream.from_readable(obj) as ipc_stream:
224+
with InputStream.from_readable(obj) as ipc_stream:
225225
return ArrayStream(ipc_stream)
226226

227227
@staticmethod
@@ -233,11 +233,11 @@ def from_path(obj, *args, **kwargs):
233233
>>> import tempfile
234234
>>> import os
235235
>>> import nanoarrow as na
236-
>>> from nanoarrow.ipc import Stream
236+
>>> from nanoarrow.ipc import InputStream
237237
>>> with tempfile.TemporaryDirectory() as td:
238238
... path = os.path.join(td, "test.arrows")
239239
... with open(path, "wb") as f:
240-
... nbytes = f.write(Stream.example_bytes())
240+
... nbytes = f.write(InputStream.example_bytes())
241241
...
242242
... with na.ArrayStream.from_path(path) as stream:
243243
... stream.read_all()
@@ -246,9 +246,9 @@ def from_path(obj, *args, **kwargs):
246246
{'some_col': 2}
247247
{'some_col': 3}
248248
"""
249-
from nanoarrow.ipc import Stream
249+
from nanoarrow.ipc import InputStream
250250

251-
with Stream.from_path(obj, *args, **kwargs) as ipc_stream:
251+
with InputStream.from_path(obj, *args, **kwargs) as ipc_stream:
252252
return ArrayStream(ipc_stream)
253253

254254
@staticmethod
@@ -261,11 +261,11 @@ def from_url(obj, *args, **kwargs):
261261
>>> import tempfile
262262
>>> import os
263263
>>> import nanoarrow as na
264-
>>> from nanoarrow.ipc import Stream
264+
>>> from nanoarrow.ipc import InputStream
265265
>>> with tempfile.TemporaryDirectory() as td:
266266
... path = os.path.join(td, "test.arrows")
267267
... with open(path, "wb") as f:
268-
... nbytes = f.write(Stream.example_bytes())
268+
... nbytes = f.write(InputStream.example_bytes())
269269
...
270270
... uri = pathlib.Path(path).as_uri()
271271
... with na.ArrayStream.from_url(uri) as stream:
@@ -275,7 +275,7 @@ def from_url(obj, *args, **kwargs):
275275
{'some_col': 2}
276276
{'some_col': 3}
277277
"""
278-
from nanoarrow.ipc import Stream
278+
from nanoarrow.ipc import InputStream
279279

280-
with Stream.from_url(obj, *args, **kwargs) as ipc_stream:
280+
with InputStream.from_url(obj, *args, **kwargs) as ipc_stream:
281281
return ArrayStream(ipc_stream)

0 commit comments

Comments
 (0)