Skip to content

Commit 5b98b3d

Browse files
paleolimbot and amoeba authored
feat(r): Implement string view support in R bindings (#636)
This PR adds support for string view and binary view types to the R bindings. As a side effect of this, conversion of character vectors to Arrow types is now simpler (just goes through nanoarrow C's array builder) and supports more types (e.g., the arrow package is no longer required to create large_string, large_binary, or fixed_size_binary).

``` r
library(nanoarrow)

long_strings <- rep(strrep(letters, 100), 100)
(array <- as_nanoarrow_array(long_strings, schema = na_string_view()))
#> <nanoarrow_array string_view[2600]>
#> $ length : int 2600
#> $ null_count: int 0
#> $ offset : int 0
#> $ buffers :List of 11
#> ..$ :<nanoarrow_buffer validity<bool>[0][0 b]> ``
#> ..$ :<nanoarrow_buffer unknown<string_view>[2600][41600 b]>`
#> ..$ :<nanoarrow_buffer data<string>[32700 b]> `aaaaaaaaaaaaaaaaaaaaaaaaaaa...`
#> ..$ :<nanoarrow_buffer data<string>[32700 b]> `ppppppppppppppppppppppppppp...`
#> ..$ :<nanoarrow_buffer data<string>[32700 b]> `eeeeeeeeeeeeeeeeeeeeeeeeeee...`
#> ..$ :<nanoarrow_buffer data<string>[32700 b]> `ttttttttttttttttttttttttttt...`
#> ..$ :<nanoarrow_buffer data<string>[32700 b]> `iiiiiiiiiiiiiiiiiiiiiiiiiii...`
#> ..$ :<nanoarrow_buffer data<string>[32700 b]> `xxxxxxxxxxxxxxxxxxxxxxxxxxx...`
#> ..$ :<nanoarrow_buffer data<string>[32700 b]> `mmmmmmmmmmmmmmmmmmmmmmmmmmm...`
#> ..$ :<nanoarrow_buffer data<string>[31100 b]> `bbbbbbbbbbbbbbbbbbbbbbbbbbb...`
#> ..$ :<nanoarrow_buffer data<int64>[8][64 b]> `32700 32700 32700 32700 3270...`
#> $ dictionary: NULL
#> $ children : list()

identical(convert_array(array), long_strings)
#> [1] TRUE
```

<sup>Created on 2024-09-27 with [reprex v2.1.1](https://reprex.tidyverse.org)</sup>

---------

Co-authored-by: Bryce Mecum <[email protected]>
1 parent d6ef480 commit 5b98b3d

16 files changed

+318
-133
lines changed

r/NAMESPACE

+2
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ export(infer_nanoarrow_ptype)
147147
export(infer_nanoarrow_ptype_extension)
148148
export(infer_nanoarrow_schema)
149149
export(na_binary)
150+
export(na_binary_view)
150151
export(na_bool)
151152
export(na_date32)
152153
export(na_date64)
@@ -176,6 +177,7 @@ export(na_map)
176177
export(na_na)
177178
export(na_sparse_union)
178179
export(na_string)
180+
export(na_string_view)
179181
export(na_struct)
180182
export(na_time32)
181183
export(na_time64)

r/R/buffer.R

+17-10
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ as_nanoarrow_buffer.default <- function(x, ...) {
6464

6565
#' @importFrom utils str
6666
#' @export
67-
str.nanoarrow_buffer <- function(object, ..., db = F, indent.str = "",
67+
str.nanoarrow_buffer <- function(object, ..., indent.str = "",
6868
width = getOption("width")) {
6969
formatted <- format(object)
7070
cat(formatted)
@@ -117,8 +117,10 @@ print.nanoarrow_buffer <- function(x, ...) {
117117
}
118118

119119
#' @export
120+
120121
format.nanoarrow_buffer <- function(x, ...) {
121122
info <- nanoarrow_buffer_info(x)
123+
is_null <- identical(nanoarrow_pointer_addr_chr(info$data), "0")
122124
if (info$data_type == "unknown") {
123125
len <- ""
124126
} else if (info$element_size_bits == 0 || info$data_type %in% c("binary", "string")) {
@@ -128,14 +130,17 @@ format.nanoarrow_buffer <- function(x, ...) {
128130
len <- sprintf("[%s][%s b]", logical_length, info$size_bytes)
129131
}
130132

131-
132-
sprintf(
133-
"<%s %s<%s>%s>",
134-
class(x)[1],
135-
info$type,
136-
info$data_type,
137-
len
138-
)
133+
if (is_null) {
134+
sprintf("<%s %s<%s>[null]", class(x)[1], info$type, info$data_type)
135+
} else {
136+
sprintf(
137+
"<%s %s<%s>%s>",
138+
class(x)[1],
139+
info$type,
140+
info$data_type,
141+
len
142+
)
143+
}
139144
}
140145

141146
#' Create and modify nanoarrow buffers
@@ -209,7 +214,7 @@ as_nanoarrow_array.nanoarrow_buffer <- function(x, ..., schema = NULL) {
209214
buffers = list(NULL, offsets, x)
210215
)
211216
)
212-
} else if(data_type %in% c("string", "binary")) {
217+
} else if (data_type %in% c("string", "binary")) {
213218
array <- nanoarrow_array_init(na_type(paste0("large_", data_type)))
214219
offsets <- as_nanoarrow_array(c(0, logical_length), schema = na_int64())$buffers[[2]]
215220
nanoarrow_array_modify(
@@ -220,6 +225,8 @@ as_nanoarrow_array.nanoarrow_buffer <- function(x, ..., schema = NULL) {
220225
buffers = list(NULL, offsets, x)
221226
)
222227
)
228+
} else if (data_type %in% c("string_view", "binary_view")) {
229+
stop("Can't convert buffer of type string_view or binary_view to array")
223230
} else {
224231
array <- nanoarrow_array_init(na_type(data_type))
225232
nanoarrow_array_modify(

r/R/infer-ptype.R

+2-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ infer_ptype_other <- function(schema) {
7676
parsed$type,
7777
"na" = vctrs::unspecified(),
7878
"binary" = ,
79-
"large_binary" = new_blob_internal(),
79+
"large_binary" = ,
80+
"binary_view" = new_blob_internal(),
8081
"date32" = structure(numeric(), class = "Date"),
8182
"time32" = ,
8283
"time64" = hms::hms(),

r/R/type.R

+16-1
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,12 @@ na_large_string <- function(nullable = TRUE) {
186186
.Call(nanoarrow_c_schema_init, NANOARROW_TYPE$LARGE_STRING, isTRUE(nullable))
187187
}
188188

189+
#' @rdname na_type
190+
#' @export
191+
na_string_view <- function(nullable = TRUE) {
192+
.Call(nanoarrow_c_schema_init, NANOARROW_TYPE$STRING_VIEW, isTRUE(nullable))
193+
}
194+
189195
#' @rdname na_type
190196
#' @export
191197
na_binary <- function(nullable = TRUE) {
@@ -209,6 +215,12 @@ na_fixed_size_binary <- function(byte_width, nullable = TRUE) {
209215
)
210216
}
211217

218+
#' @rdname na_type
219+
#' @export
220+
na_binary_view <- function(nullable = TRUE) {
221+
.Call(nanoarrow_c_schema_init, NANOARROW_TYPE$BINARY_VIEW, isTRUE(nullable))
222+
}
223+
212224
#' @rdname na_type
213225
#' @export
214226
na_date32 <- function(nullable = TRUE) {
@@ -460,7 +472,10 @@ NANOARROW_TYPE <- list(
460472
LARGE_STRING = 35L,
461473
LARGE_BINARY = 36L,
462474
LARGE_LIST = 37L,
463-
INTERVAL_MONTH_DAY_NANO = 38L
475+
INTERVAL_MONTH_DAY_NANO = 38L,
476+
RUN_END_ENCODED = 39L,
477+
BINARY_VIEW = 40L,
478+
STRING_VIEW = 41L
464479
)
465480

466481
ARROW_FLAG <- list(

r/man/na_type.Rd

+6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/array.c

+35-5
Original file line numberDiff line numberDiff line change
@@ -370,13 +370,43 @@ static SEXP borrow_buffer(struct ArrowArrayView* array_view, int64_t i, SEXP she
370370
SEXP buffer_class = PROTECT(Rf_allocVector(STRSXP, 2));
371371
SET_STRING_ELT(buffer_class, 1, Rf_mkChar("nanoarrow_buffer"));
372372

373+
struct ArrowBufferView view;
374+
enum ArrowBufferType buffer_type;
375+
enum ArrowType data_type;
376+
int64_t element_size_bits;
377+
if ((array_view->storage_type == NANOARROW_TYPE_STRING_VIEW ||
378+
array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) &&
379+
i >= NANOARROW_BINARY_VIEW_FIXED_BUFFERS) {
380+
view.data.data = array_view->array->buffers[i];
381+
382+
if (i == (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) {
383+
view.size_bytes = array_view->n_variadic_buffers * sizeof(int64_t);
384+
buffer_type = NANOARROW_BUFFER_TYPE_DATA;
385+
data_type = NANOARROW_TYPE_INT64;
386+
element_size_bits = 64;
387+
} else {
388+
view.size_bytes =
389+
array_view->variadic_buffer_sizes[i - NANOARROW_BINARY_VIEW_FIXED_BUFFERS];
390+
buffer_type = NANOARROW_BUFFER_TYPE_DATA;
391+
392+
if (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW) {
393+
data_type = NANOARROW_TYPE_STRING;
394+
} else {
395+
data_type = NANOARROW_TYPE_BINARY;
396+
}
397+
element_size_bits = 0;
398+
}
399+
} else {
400+
view = array_view->buffer_views[i];
401+
buffer_type = array_view->layout.buffer_type[i];
402+
data_type = array_view->layout.buffer_data_type[i];
403+
element_size_bits = array_view->layout.element_size_bits[i];
404+
}
405+
373406
SEXP buffer_xptr =
374-
PROTECT(buffer_borrowed_xptr(array_view->buffer_views[i].data.data,
375-
array_view->buffer_views[i].size_bytes, shelter));
407+
PROTECT(buffer_borrowed_xptr(view.data.data, view.size_bytes, shelter));
376408

377-
buffer_borrowed_xptr_set_type(buffer_xptr, array_view->layout.buffer_type[i],
378-
array_view->layout.buffer_data_type[i],
379-
array_view->layout.element_size_bits[i]);
409+
buffer_borrowed_xptr_set_type(buffer_xptr, buffer_type, data_type, element_size_bits);
380410
UNPROTECT(2);
381411
return buffer_xptr;
382412
}

0 commit comments

Comments
 (0)