Skip to content

Commit 396d851

Browse files
authored
feat(r): Add bindings to IPC writer (#608)
This PR adds a basic level of support for IPC writing in the R package. This is basically a thin wrapper around `ArrowIpcWriterWriteStream()` and could be more feature-rich like the Python version (that allows schemas and batches to be written individually). I also added a bit of code to handle interrupts (which should catch interrupts on read and write and wasn't handled before). ``` r library(nanoarrow) tf <- tempfile() nycflights13::flights |> write_nanoarrow(tf) (df <- tf |> read_nanoarrow() |> tibble::as_tibble()) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time #> <int> <int> <int> <int> <int> <dbl> <int> <int> #> 1 2013 1 1 517 515 2 830 819 #> 2 2013 1 1 533 529 4 850 830 #> 3 2013 1 1 542 540 2 923 850 #> 4 2013 1 1 544 545 -1 1004 1022 #> 5 2013 1 1 554 600 -6 812 837 #> 6 2013 1 1 554 558 -4 740 728 #> 7 2013 1 1 555 600 -5 913 854 #> 8 2013 1 1 557 600 -3 709 723 #> 9 2013 1 1 557 600 -3 838 846 #> 10 2013 1 1 558 600 -2 753 745 #> # ℹ 336,766 more rows #> # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, #> # hour <dbl>, minute <dbl>, time_hour <dttm> identical(df, nycflights13::flights) #> [1] TRUE ``` <sup>Created on 2024-09-14 with [reprex v2.1.1](https://reprex.tidyverse.org)</sup>
1 parent e18bd38 commit 396d851

File tree

9 files changed

+331
-21
lines changed

9 files changed

+331
-21
lines changed

r/DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Description: Provides an 'R' interface to the 'nanoarrow' 'C' library and the
1919
License: Apache License (>= 2)
2020
Encoding: UTF-8
2121
Roxygen: list(markdown = TRUE)
22-
RoxygenNote: 7.2.3
22+
RoxygenNote: 7.3.2
2323
URL: https://arrow.apache.org/nanoarrow/latest/r/, https://github.com/apache/arrow-nanoarrow
2424
BugReports: https://github.com/apache/arrow-nanoarrow/issues
2525
Suggests:

r/NAMESPACE

+3
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ S3method(str,nanoarrow_array_stream)
127127
S3method(str,nanoarrow_buffer)
128128
S3method(str,nanoarrow_schema)
129129
S3method(str,nanoarrow_vctr)
130+
S3method(write_nanoarrow,character)
131+
S3method(write_nanoarrow,connection)
130132
export(array_stream_set_finalizer)
131133
export(as_nanoarrow_array)
132134
export(as_nanoarrow_array_extension)
@@ -210,6 +212,7 @@ export(read_nanoarrow)
210212
export(register_nanoarrow_extension)
211213
export(resolve_nanoarrow_extension)
212214
export(unregister_nanoarrow_extension)
215+
export(write_nanoarrow)
213216
importFrom(utils,getFromNamespace)
214217
importFrom(utils,str)
215218
useDynLib(nanoarrow, .registration = TRUE)

r/R/ipc.R

+66-6
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,17 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
#' Read serialized streams of Arrow data
18+
#' Read/write serialized streams of Arrow data
1919
#'
20-
#' Reads connections, file paths, URLs, or raw vectors of serialized Arrow
21-
#' data. Arrow documentation typically refers to this format as "Arrow IPC",
22-
#' since its origin was as a means to transmit tables between processes
20+
#' Reads/writes connections, file paths, URLs, or raw vectors from/to serialized
21+
#' Arrow data. Arrow documentation typically refers to this format as "Arrow
22+
#' IPC", since its origin was as a means to transmit tables between processes
2323
#' (e.g., multiple R sessions). This format can also be written to and read
2424
#' from files or URLs and is essentially a high performance equivalent of
2525
#' a CSV file that does a better job maintaining types.
2626
#'
27-
#' The nanoarrow package does not currently have the ability to write serialized
28-
#' IPC data: use [arrow::write_ipc_stream()] to write data from R, or use
27+
#' The nanoarrow package implements an IPC writer; however, you can also
28+
#' use [arrow::write_ipc_stream()] to write data from R, or use
2929
#' the equivalent writer from another Arrow implementation in Python, C++,
3030
#' Rust, JavaScript, Julia, C#, and beyond.
3131
#'
@@ -35,6 +35,8 @@
3535
#' @param x A `raw()` vector, connection, or file path from which to read
3636
#' binary data. Common extensions indicating compression (.gz, .bz2, .zip)
3737
#' are automatically uncompressed.
38+
#' @param data An object to write as an Arrow IPC stream, converted using
39+
#' [as_nanoarrow_array_stream()]. Notably, this includes a [data.frame()].
3840
#' @param lazy By default, `read_nanoarrow()` will read and discard a copy of
3941
#' the reader's schema to ensure that invalid streams are discovered as
4042
#' soon as possible. Use `lazy = TRUE` to defer this check until the reader
@@ -107,6 +109,42 @@ read_nanoarrow.connection <- function(x, ..., lazy = FALSE) {
107109
check_stream_if_requested(reader, lazy)
108110
}
109111

112+
#' @rdname read_nanoarrow
113+
#' @export
114+
write_nanoarrow <- function(data, x, ...) {
115+
UseMethod("write_nanoarrow", x)
116+
}
117+
118+
#' @export
119+
write_nanoarrow.connection <- function(data, x, ...) {
120+
if (!isOpen(x)) {
121+
open(x, "wb")
122+
on.exit(close(x))
123+
}
124+
125+
writer <- .Call(nanoarrow_c_ipc_writer_connection, x)
126+
stream <- as_nanoarrow_array_stream(data)
127+
on.exit(nanoarrow_pointer_release(stream), add = TRUE)
128+
129+
.Call(nanoarrow_c_ipc_writer_write_stream, writer, stream)
130+
invisible(data)
131+
}
132+
133+
#' @export
134+
write_nanoarrow.character <- function(data, x, ...) {
135+
if (length(x) != 1) {
136+
stop(sprintf("Can't interpret character(%d) as file path", length(x)))
137+
}
138+
139+
con_type <- guess_connection_type(x)
140+
if (con_type == "unz") {
141+
stop("zip compression not supported for write_nanoarrow()")
142+
}
143+
144+
con <- do.call(con_type, list(x))
145+
write_nanoarrow(data, con)
146+
}
147+
110148
#' @rdname read_nanoarrow
111149
#' @export
112150
example_ipc_stream <- function() {
@@ -205,3 +243,25 @@ guess_zip_filename <- function(x) {
205243

206244
files
207245
}
246+
247+
# The C-level R_tryCatch() does not provide for handling interrupts (or
248+
# I couldn't figure out how to make it work), so instead we provide wrappers
249+
# around readBin() and writeBin() that convert interrupt conditions to errors
250+
# (which the C code does know how to handle).
251+
read_bin_wrapper <- function(con, what, n) {
252+
withCallingHandlers(
253+
readBin(con, what, n),
254+
interrupt = function(e) {
255+
stop("user interrupt")
256+
}
257+
)
258+
}
259+
260+
write_bin_wrapper <- function(object, con) {
261+
withCallingHandlers(
262+
writeBin(object, con),
263+
interrupt = function(e) {
264+
stop("user interrupt")
265+
}
266+
)
267+
}

r/man/read_nanoarrow.Rd

+12-6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/init.c

+5
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ extern SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr);
5858
extern SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp);
5959
extern SEXP nanoarrow_c_ipc_array_reader_buffer(SEXP buffer_xptr);
6060
extern SEXP nanoarrow_c_ipc_array_reader_connection(SEXP con);
61+
extern SEXP nanoarrow_c_ipc_writer_connection(SEXP con);
62+
extern SEXP nanoarrow_c_ipc_writer_write_stream(SEXP writer_xptr, SEXP array_stream_xptr);
6163
extern SEXP nanoarrow_c_allocate_schema(void);
6264
extern SEXP nanoarrow_c_allocate_array(void);
6365
extern SEXP nanoarrow_c_allocate_array_stream(void);
@@ -136,6 +138,9 @@ static const R_CallMethodDef CallEntries[] = {
136138
1},
137139
{"nanoarrow_c_ipc_array_reader_connection",
138140
(DL_FUNC)&nanoarrow_c_ipc_array_reader_connection, 1},
141+
{"nanoarrow_c_ipc_writer_connection", (DL_FUNC)&nanoarrow_c_ipc_writer_connection, 1},
142+
{"nanoarrow_c_ipc_writer_write_stream", (DL_FUNC)&nanoarrow_c_ipc_writer_write_stream,
143+
2},
139144
{"nanoarrow_c_allocate_schema", (DL_FUNC)&nanoarrow_c_allocate_schema, 0},
140145
{"nanoarrow_c_allocate_array", (DL_FUNC)&nanoarrow_c_allocate_array, 0},
141146
{"nanoarrow_c_allocate_array_stream", (DL_FUNC)&nanoarrow_c_allocate_array_stream, 0},

r/src/ipc.c

+145-4
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
#include <stdint.h>
1819
#define R_NO_REMAP
1920
#include <R.h>
2021
#include <Rinternals.h>
@@ -48,6 +49,50 @@ static SEXP input_stream_owning_xptr(void) {
4849
return input_stream_xptr;
4950
}
5051

52+
static void finalize_output_stream_xptr(SEXP output_stream_xptr) {
53+
struct ArrowIpcOutputStream* output_stream =
54+
(struct ArrowIpcOutputStream*)R_ExternalPtrAddr(output_stream_xptr);
55+
if (output_stream != NULL && output_stream->release != NULL) {
56+
output_stream->release(output_stream);
57+
}
58+
59+
if (output_stream != NULL) {
60+
ArrowFree(output_stream);
61+
}
62+
}
63+
64+
static SEXP output_stream_owning_xptr(void) {
65+
struct ArrowIpcOutputStream* output_stream =
66+
(struct ArrowIpcOutputStream*)ArrowMalloc(sizeof(struct ArrowIpcOutputStream));
67+
output_stream->release = NULL;
68+
SEXP output_stream_xptr =
69+
PROTECT(R_MakeExternalPtr(output_stream, R_NilValue, R_NilValue));
70+
R_RegisterCFinalizer(output_stream_xptr, &finalize_output_stream_xptr);
71+
UNPROTECT(1);
72+
return output_stream_xptr;
73+
}
74+
75+
static void finalize_writer_xptr(SEXP writer_xptr) {
76+
struct ArrowIpcWriter* writer = (struct ArrowIpcWriter*)R_ExternalPtrAddr(writer_xptr);
77+
if (writer != NULL && writer->private_data != NULL) {
78+
ArrowIpcWriterReset(writer);
79+
}
80+
81+
if (writer != NULL) {
82+
ArrowFree(writer);
83+
}
84+
}
85+
86+
static SEXP writer_owning_xptr(void) {
87+
struct ArrowIpcWriter* writer =
88+
(struct ArrowIpcWriter*)ArrowMalloc(sizeof(struct ArrowIpcWriter));
89+
writer->private_data = NULL;
90+
SEXP writer_xptr = PROTECT(R_MakeExternalPtr(writer, R_NilValue, R_NilValue));
91+
R_RegisterCFinalizer(writer_xptr, &finalize_writer_xptr);
92+
UNPROTECT(1);
93+
return writer_xptr;
94+
}
95+
5196
SEXP nanoarrow_c_ipc_array_reader_buffer(SEXP buffer_xptr) {
5297
struct ArrowBuffer* buffer = buffer_from_xptr(buffer_xptr);
5398

@@ -82,7 +127,7 @@ struct ConnectionInputStreamHandler {
82127
int return_code;
83128
};
84129

85-
static SEXP handle_readbin_error(SEXP cond, void* hdata) {
130+
static SEXP handle_readbin_writebin_error(SEXP cond, void* hdata) {
86131
struct ConnectionInputStreamHandler* data = (struct ConnectionInputStreamHandler*)hdata;
87132

88133
SEXP fun = PROTECT(Rf_install("conditionMessage"));
@@ -103,7 +148,7 @@ static SEXP call_readbin(void* hdata) {
103148
SEXP n = PROTECT(Rf_ScalarReal((double)data->buf_size_bytes));
104149
SEXP call = PROTECT(Rf_lang4(nanoarrow_sym_readbin, data->con, nanoarrow_ptype_raw, n));
105150

106-
SEXP result = PROTECT(Rf_eval(call, R_BaseEnv));
151+
SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
107152
R_xlen_t bytes_read = Rf_xlength(result);
108153
memcpy(data->buf, RAW(result), bytes_read);
109154
*(data->size_read_out) = bytes_read;
@@ -112,6 +157,36 @@ static SEXP call_readbin(void* hdata) {
112157
return R_NilValue;
113158
}
114159

160+
static SEXP call_writebin(void* hdata) {
161+
struct ConnectionInputStreamHandler* data = (struct ConnectionInputStreamHandler*)hdata;
162+
163+
// Write 16MB chunks. This a balance between being small enough not to
164+
// copy too much of the source unnecessarily and big enough to avoid
165+
// unnecessary R evaluation overhead.
166+
int64_t chunk_buffer_size = 16777216;
167+
SEXP chunk_buffer = PROTECT(Rf_allocVector(RAWSXP, chunk_buffer_size));
168+
SEXP call = PROTECT(Rf_lang3(nanoarrow_sym_writebin, chunk_buffer, data->con));
169+
while (data->buf_size_bytes > chunk_buffer_size) {
170+
memcpy(RAW(chunk_buffer), data->buf, chunk_buffer_size);
171+
Rf_eval(call, nanoarrow_ns_pkg);
172+
data->buf_size_bytes -= chunk_buffer_size;
173+
data->buf += chunk_buffer_size;
174+
}
175+
176+
UNPROTECT(2);
177+
178+
// Write remaining bytes
179+
if (data->buf_size_bytes > 0) {
180+
chunk_buffer = PROTECT(Rf_allocVector(RAWSXP, data->buf_size_bytes));
181+
call = PROTECT(Rf_lang3(nanoarrow_sym_writebin, chunk_buffer, data->con));
182+
memcpy(RAW(chunk_buffer), data->buf, data->buf_size_bytes);
183+
Rf_eval(call, nanoarrow_ns_pkg);
184+
UNPROTECT(2);
185+
}
186+
187+
return R_NilValue;
188+
}
189+
115190
static ArrowErrorCode read_con_input_stream(struct ArrowIpcInputStream* stream,
116191
uint8_t* buf, int64_t buf_size_bytes,
117192
int64_t* size_read_out,
@@ -129,14 +204,43 @@ static ArrowErrorCode read_con_input_stream(struct ArrowIpcInputStream* stream,
129204
data.error = error;
130205
data.return_code = NANOARROW_OK;
131206

132-
R_tryCatchError(&call_readbin, &data, &handle_readbin_error, &data);
207+
R_tryCatchError(&call_readbin, &data, &handle_readbin_writebin_error, &data);
208+
return data.return_code;
209+
}
210+
211+
static ArrowErrorCode write_con_output_stream(struct ArrowIpcOutputStream* stream,
212+
const void* buf, int64_t buf_size_bytes,
213+
int64_t* size_write_out,
214+
struct ArrowError* error) {
215+
if (!nanoarrow_is_main_thread()) {
216+
ArrowErrorSet(error, "Can't read from R connection on a non-R thread");
217+
return EIO;
218+
}
219+
220+
struct ConnectionInputStreamHandler data;
221+
data.con = (SEXP)stream->private_data;
222+
data.buf = (void*)buf;
223+
data.buf_size_bytes = buf_size_bytes;
224+
data.size_read_out = NULL;
225+
data.error = error;
226+
data.return_code = NANOARROW_OK;
227+
228+
R_tryCatchError(&call_writebin, &data, &handle_readbin_writebin_error, &data);
229+
230+
// This implementation always blocks until all bytes have been written
231+
*size_write_out = buf_size_bytes;
232+
133233
return data.return_code;
134234
}
135235

136236
static void release_con_input_stream(struct ArrowIpcInputStream* stream) {
137237
nanoarrow_release_sexp((SEXP)stream->private_data);
138238
}
139239

240+
static void release_con_output_stream(struct ArrowIpcOutputStream* stream) {
241+
nanoarrow_release_sexp((SEXP)stream->private_data);
242+
}
243+
140244
SEXP nanoarrow_c_ipc_array_reader_connection(SEXP con) {
141245
SEXP array_stream_xptr = PROTECT(nanoarrow_array_stream_owning_xptr());
142246
struct ArrowArrayStream* array_stream =
@@ -153,9 +257,46 @@ SEXP nanoarrow_c_ipc_array_reader_connection(SEXP con) {
153257

154258
int code = ArrowIpcArrayStreamReaderInit(array_stream, input_stream, NULL);
155259
if (code != NANOARROW_OK) {
156-
Rf_error("ArrowIpcArrayStreamReaderInit() failed");
260+
Rf_error("ArrowIpcArrayStreamReaderInit() failed with errno %d", code);
157261
}
158262

159263
UNPROTECT(2);
160264
return array_stream_xptr;
161265
}
266+
267+
SEXP nanoarrow_c_ipc_writer_connection(SEXP con) {
268+
SEXP output_stream_xptr = PROTECT(output_stream_owning_xptr());
269+
struct ArrowIpcOutputStream* output_stream =
270+
(struct ArrowIpcOutputStream*)R_ExternalPtrAddr(output_stream_xptr);
271+
272+
output_stream->write = &write_con_output_stream;
273+
output_stream->release = &release_con_output_stream;
274+
output_stream->private_data = (SEXP)con;
275+
nanoarrow_preserve_sexp(con);
276+
277+
SEXP writer_xptr = PROTECT(writer_owning_xptr());
278+
struct ArrowIpcWriter* writer = (struct ArrowIpcWriter*)R_ExternalPtrAddr(writer_xptr);
279+
280+
int code = ArrowIpcWriterInit(writer, output_stream);
281+
if (code != NANOARROW_OK) {
282+
Rf_error("ArrowIpcWriterInit() failed with errno %d", code);
283+
}
284+
285+
UNPROTECT(2);
286+
return writer_xptr;
287+
}
288+
289+
SEXP nanoarrow_c_ipc_writer_write_stream(SEXP writer_xptr, SEXP array_stream_xptr) {
290+
struct ArrowIpcWriter* writer = (struct ArrowIpcWriter*)R_ExternalPtrAddr(writer_xptr);
291+
struct ArrowArrayStream* array_stream =
292+
nanoarrow_array_stream_from_xptr(array_stream_xptr);
293+
294+
struct ArrowError error;
295+
ArrowErrorInit(&error);
296+
int code = ArrowIpcWriterWriteArrayStream(writer, array_stream, &error);
297+
if (code != NANOARROW_OK) {
298+
Rf_error("ArrowIpcWriterWriteArrayStream() failed: %s", error.message);
299+
}
300+
301+
return R_NilValue;
302+
}

0 commit comments

Comments
 (0)