Skip to content

Commit 1eaa8d5

Browse files
authored
feat(r): Support matrix objects as fixed-size-list arrays (#692)
Still needs some testing on the stream case, and is unfortunately not very zero-copy; however, it gets the job done (and I think it fixes some cases where we would otherwise have silently handled a matrix as its storage type). Inspired by #691!

``` r
library(nanoarrow)

df <- data.frame(x = 1:10)
df$matrix_col <- matrix(letters[1:20], ncol = 2, byrow = TRUE)

array <- as_nanoarrow_array(df)

# Default comes back as list_of(character())
convert_array(array) |>
  tibble::as_tibble()
#> # A tibble: 10 × 2
#>        x  matrix_col
#>    <int> <list<chr>>
#>  1     1         [2]
#>  2     2         [2]
#>  3     3         [2]
#>  4     4         [2]
#>  5     5         [2]
#>  6     6         [2]
#>  7     7         [2]
#>  8     8         [2]
#>  9     9         [2]
#> 10    10         [2]

# But can specify matrix
convert_array(
  array,
  tibble::tibble(x = integer(), matrix_col = matrix(character(), ncol = 2))
)
#> # A tibble: 10 × 2
#>        x matrix_col[,1] [,2]
#>    <int> <chr>          <chr>
#>  1     1 a              b
#>  2     2 c              d
#>  3     3 e              f
#>  4     4 g              h
#>  5     5 i              j
#>  6     6 k              l
#>  7     7 m              n
#>  8     8 o              p
#>  9     9 q              r
#> 10    10 s              t
```

<sup>Created on 2024-12-12 with [reprex v2.1.1](https://reprex.tidyverse.org)</sup>
1 parent 7b8a7c8 commit 1eaa8d5

15 files changed

+329
-28
lines changed

r/NAMESPACE

+2
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ S3method(as_nanoarrow_array,difftime)
3636
S3method(as_nanoarrow_array,factor)
3737
S3method(as_nanoarrow_array,integer64)
3838
S3method(as_nanoarrow_array,list)
39+
S3method(as_nanoarrow_array,matrix)
3940
S3method(as_nanoarrow_array,nanoarrow_array)
4041
S3method(as_nanoarrow_array,nanoarrow_buffer)
4142
S3method(as_nanoarrow_array,vctrs_unspecified)
@@ -100,6 +101,7 @@ S3method(infer_nanoarrow_schema,integer)
100101
S3method(infer_nanoarrow_schema,integer64)
101102
S3method(infer_nanoarrow_schema,list)
102103
S3method(infer_nanoarrow_schema,logical)
104+
S3method(infer_nanoarrow_schema,matrix)
103105
S3method(infer_nanoarrow_schema,nanoarrow_array)
104106
S3method(infer_nanoarrow_schema,nanoarrow_array_stream)
105107
S3method(infer_nanoarrow_schema,nanoarrow_vctr)

r/R/as-array.R

+36
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,42 @@ as_nanoarrow_array.blob <- function(x, ..., schema = NULL) {
194194
as_nanoarrow_array(unclass(x), schema = schema)
195195
}
196196

197+
#' @export
198+
as_nanoarrow_array.matrix <- function(x, ..., schema = NULL) {
# Convert a matrix into a fixed-size list array: one list element per row of
# `x`, each holding ncol(x) items. `schema` is inferred from `x` when NULL and
# otherwise coerced with as_nanoarrow_schema(). Stops with an error if the
# schema is not a fixed-size list whose size equals ncol(x).
199+
if (is.null(schema)) {
200+
schema <- infer_nanoarrow_schema(x)
201+
} else {
202+
schema <- as_nanoarrow_schema(schema)
203+
}
204+
205+
# The schema's format string must be "+w:<ncol>"; anything else is reported
# below as not being a fixed-size list of ncol(x) elements.
expected_format <- paste0("+w:", ncol(x))
206+
if (expected_format != schema$format) {
207+
stop(
208+
sprintf(
209+
"Expected schema for matrix with fixed-size list of %d elements but got %s",
210+
ncol(x),
211+
nanoarrow_schema_formatted(schema)
212+
)
213+
)
214+
}
215+
216+
# Raw unclass() doesn't work for matrix()
# Instead, transpose so that the values of each row become adjacent, then
# strip all attributes (including dim) to leave a plain vector for the child.
217+
row_major_data <- t(x)
218+
attributes(row_major_data) <- NULL
219+
220+
# The flattened values become the single child array, converted using the
# schema's first child.
child_array <- as_nanoarrow_array(row_major_data, schema = schema$children[[1]])
221+
array <- nanoarrow_array_init(schema)
222+
# Assemble the parent: nrow(x) elements, no parent-level nulls, and a single
# NULL buffer (presumably the validity buffer -- TODO confirm).
nanoarrow_array_modify(
223+
array,
224+
list(
225+
length = nrow(x),
226+
null_count = 0,
227+
buffers = list(NULL),
228+
children = list(child_array)
229+
)
230+
)
231+
}
232+
197233
#' @export
198234
as_nanoarrow_array.data.frame <- function(x, ..., schema = NULL) {
199235
# We need to override this to prevent the list implementation from handling it

r/R/convert-array.R

+2
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@
6868
#' be converted to [blob::blob()].
6969
#' - [vctrs::list_of()]: List, large list, and fixed-size list types can be
7070
#' converted to [vctrs::list_of()].
71+
#' - [matrix()]: Fixed-size list types can be converted to
72+
#' `matrix(ptype, ncol = fixed_size)`.
7173
#' - [data.frame()]: Struct types can be converted to [data.frame()].
7274
#' - [vctrs::unspecified()]: Any type can be converted to [vctrs::unspecified()];
7375
#' however, a warning will be raised if any non-null values are encountered.

r/R/schema.R

+8
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,14 @@ infer_nanoarrow_schema.vctrs_unspecified <- function(x, ...) {
153153
na_na()
154154
}
155155

156+
#' @export
157+
infer_nanoarrow_schema.matrix <- function(x, ...) {
# Infer a fixed-size list schema for a matrix. The item schema is inferred
# from a zero-length subset of `x` (x[integer(0)], unclassed), and the list
# size is the number of columns.
158+
na_fixed_size_list(
159+
infer_nanoarrow_schema(unclass(x[integer(0)])),
160+
list_size = ncol(x)
161+
)
162+
}
163+
156164
#' @export
157165
infer_nanoarrow_schema.vctrs_list_of <- function(x, ...) {
158166
child_type <- infer_nanoarrow_schema(attr(x, "ptype"))

r/man/convert_array.Rd

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/convert.c

+30-15
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
static R_xlen_t nanoarrow_vec_size(SEXP vec_sexp, struct PTypeView* ptype_view) {
3030
if (ptype_view->vector_type == VECTOR_TYPE_DATA_FRAME) {
3131
return nanoarrow_data_frame_size(vec_sexp);
32+
} else if (Rf_isMatrix(vec_sexp)) {
33+
return Rf_nrows(vec_sexp);
3234
} else {
3335
return Rf_xlength(vec_sexp);
3436
}
@@ -149,12 +151,7 @@ static void set_converter_data_frame(SEXP converter_xptr, struct RConverter* con
149151
}
150152

151153
static void set_converter_list_of(SEXP converter_xptr, struct RConverter* converter,
152-
SEXP ptype) {
153-
SEXP child_ptype = Rf_getAttrib(ptype, Rf_install("ptype"));
154-
if (child_ptype == R_NilValue) {
155-
Rf_error("Expected attribute 'ptype' for conversion to list_of");
156-
}
157-
154+
SEXP ptype, SEXP child_ptype) {
158155
converter->children = (struct RConverter**)ArrowMalloc(1 * sizeof(struct RConverter*));
159156
if (converter->children == NULL) {
160157
Rf_error("Failed to allocate converter children array");
@@ -230,15 +227,25 @@ SEXP nanoarrow_converter_from_ptype(SEXP ptype) {
230227
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
231228
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
232229

233-
if (Rf_isObject(ptype)) {
230+
if (Rf_isMatrix(ptype)) {
231+
converter->ptype_view.vector_type = VECTOR_TYPE_MATRIX;
232+
SEXP child_ptype = PROTECT(Rf_allocVector(TYPEOF(ptype), 0));
233+
set_converter_list_of(converter_xptr, converter, ptype, child_ptype);
234+
UNPROTECT(1);
235+
} else if (Rf_isObject(ptype)) {
234236
if (nanoarrow_ptype_is_data_frame(ptype)) {
235237
converter->ptype_view.vector_type = VECTOR_TYPE_DATA_FRAME;
236238
set_converter_data_frame(converter_xptr, converter, ptype);
237239
} else if (Rf_inherits(ptype, "blob")) {
238240
converter->ptype_view.vector_type = VECTOR_TYPE_BLOB;
239241
} else if (Rf_inherits(ptype, "vctrs_list_of")) {
240242
converter->ptype_view.vector_type = VECTOR_TYPE_LIST_OF;
241-
set_converter_list_of(converter_xptr, converter, ptype);
243+
SEXP child_ptype = Rf_getAttrib(ptype, Rf_install("ptype"));
244+
if (child_ptype == R_NilValue) {
245+
Rf_error("Expected attribute 'ptype' for conversion to list_of");
246+
}
247+
248+
set_converter_list_of(converter_xptr, converter, ptype, child_ptype);
242249
} else if (Rf_inherits(ptype, "vctrs_unspecified")) {
243250
converter->ptype_view.vector_type = VECTOR_TYPE_UNSPECIFIED;
244251
} else if (Rf_inherits(ptype, "Date")) {
@@ -300,7 +307,8 @@ int nanoarrow_converter_set_schema(SEXP converter_xptr, SEXP schema_xptr) {
300307
ArrowArrayViewInitFromSchema(&converter->array_view, schema, &converter->error));
301308

302309
if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF ||
303-
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) {
310+
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME ||
311+
converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
304312
set_converter_children_schema(converter_xptr, schema_xptr);
305313
}
306314

@@ -318,7 +326,8 @@ int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr) {
318326
converter->src.length = 0;
319327

320328
if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF ||
321-
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) {
329+
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME ||
330+
converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
322331
set_converter_children_array(converter_xptr, array_xptr);
323332
}
324333

@@ -343,17 +352,23 @@ void sync_after_converter_reallocate(SEXP converter_xptr, struct RConverter* con
343352
converter->children[i], VECTOR_ELT(result_sexp, i),
344353
capacity);
345354
}
355+
} else if (converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
356+
// Reserve for the child converter here, which ensures that a matrix column in
357+
// a data.frame() will get allocated properly.
358+
SEXP child_converters = VECTOR_ELT(converter_shelter, 3);
359+
SEXP item_converter_xptr = VECTOR_ELT(child_converters, 0);
360+
nanoarrow_converter_reserve(item_converter_xptr,
361+
capacity * Rf_ncols(converter->ptype_view.ptype));
346362
}
347363
}
348364

349-
int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
365+
void nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
350366
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
351367
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
352368
SEXP current_result = VECTOR_ELT(converter_shelter, 4);
353369

354370
if (current_result != R_NilValue) {
355-
ArrowErrorSet(&converter->error, "Reallocation in converter is not implemented");
356-
return ENOTSUP;
371+
Rf_error("Reallocation in converter is not implemented");
357372
}
358373

359374
SEXP result_sexp;
@@ -368,7 +383,6 @@ int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
368383
sync_after_converter_reallocate(converter_xptr, converter, result_sexp,
369384
additional_size);
370385
UNPROTECT(1);
371-
return NANOARROW_OK;
372386
}
373387

374388
R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n) {
@@ -401,7 +415,7 @@ R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n) {
401415
int nanoarrow_converter_materialize_all(SEXP converter_xptr) {
402416
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
403417
R_xlen_t remaining = converter->array_view.array->length;
404-
NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, remaining));
418+
nanoarrow_converter_reserve(converter_xptr, remaining);
405419
if (nanoarrow_converter_materialize_n(converter_xptr, remaining) != remaining) {
406420
return ERANGE;
407421
} else {
@@ -415,6 +429,7 @@ int nanoarrow_converter_finalize(SEXP converter_xptr) {
415429
SEXP current_result = VECTOR_ELT(converter_shelter, 4);
416430

417431
NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_finalize_result(converter_xptr));
432+
current_result = VECTOR_ELT(converter_shelter, 4);
418433

419434
// Check result size. A future implementation could also shrink the length
420435
// or reallocate a shorter vector.

r/src/convert.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr);
4141

4242
// Reserve space in the R vector output for additional elements. In theory
4343
// this could be used to provide growable behaviour; however, this is not
44-
// implemented. Returns an errno code.
45-
int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size);
44+
// implemented.
45+
void nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size);
4646

4747
// Materialize the next n elements into the output. Returns the number of elements
4848
// that were actually materialized which may be less than n.

r/src/convert_array_stream.c

+1-3
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,7 @@ SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp,
9494
nanoarrow_converter_stop(converter_xptr);
9595
}
9696

97-
if (nanoarrow_converter_reserve(converter_xptr, size) != NANOARROW_OK) {
98-
nanoarrow_converter_stop(converter_xptr);
99-
}
97+
nanoarrow_converter_reserve(converter_xptr, size);
10098

10199
int64_t n_batches = 0;
102100
do {

r/src/materialize.c

+83-5
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,12 @@ int nanoarrow_ptype_is_nanoarrow_vctr(SEXP ptype) {
116116
SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) {
117117
SEXP result;
118118

119-
if (Rf_isObject(ptype)) {
119+
if (Rf_isMatrix(ptype)) {
120+
// The actual value is built in the child converter but we can't have
121+
// a NULL here because that confuses the internals into thinking that
122+
// the allocate was never called.
123+
result = PROTECT(Rf_allocVector(TYPEOF(ptype), 0));
124+
} else if (Rf_isObject(ptype)) {
120125
// There may be a more accurate test that more precisely captures the case
121126
// where a user has specified a valid ptype that doesn't work in a preallocate
122127
// + fill conversion.
@@ -301,11 +306,12 @@ static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) {
301306

302307
int nanoarrow_materialize_finalize_result(SEXP converter_xptr) {
303308
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
309+
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
304310
SEXP result = VECTOR_ELT(converter_shelter, 4);
305311

306312
// Materialize never called (e.g., empty stream)
307313
if (result == R_NilValue) {
308-
NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0));
314+
nanoarrow_converter_reserve(converter_xptr, 0);
309315
result = VECTOR_ELT(converter_shelter, 4);
310316
}
311317

@@ -357,6 +363,22 @@ int nanoarrow_materialize_finalize_result(SEXP converter_xptr) {
357363
SET_VECTOR_ELT(result, i, child_result);
358364
UNPROTECT(1);
359365
}
366+
} else if (converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
367+
SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
368+
SEXP item_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0);
369+
NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_finalize_result(item_converter_xptr));
370+
SEXP item_result = PROTECT(nanoarrow_converter_release_result(item_converter_xptr));
371+
372+
SEXP matrix_symbol = PROTECT(Rf_install("matrix"));
373+
SEXP nrow_sexp = PROTECT(
374+
Rf_ScalarInteger(Rf_xlength(item_result) / converter->schema_view.fixed_size));
375+
SEXP ncol_sexp = PROTECT(Rf_ScalarInteger(converter->schema_view.fixed_size));
376+
SEXP byrow_sexp = PROTECT(Rf_ScalarLogical(TRUE));
377+
SEXP matrix_call =
378+
PROTECT(Rf_lang5(matrix_symbol, item_result, nrow_sexp, ncol_sexp, byrow_sexp));
379+
SEXP final_result = PROTECT(Rf_eval(matrix_call, R_BaseNamespace));
380+
SET_VECTOR_ELT(converter_shelter, 4, final_result);
381+
UNPROTECT(7);
360382
}
361383

362384
return NANOARROW_OK;
@@ -496,9 +518,7 @@ static int nanoarrow_materialize_data_frame(struct RConverter* converter,
496518

497519
static int materialize_list_element(struct RConverter* converter, SEXP converter_xptr,
498520
int64_t offset, int64_t length) {
499-
if (nanoarrow_converter_reserve(converter_xptr, length) != NANOARROW_OK) {
500-
nanoarrow_converter_stop(converter_xptr);
501-
}
521+
nanoarrow_converter_reserve(converter_xptr, length);
502522

503523
converter->src.offset = offset;
504524
converter->src.length = length;
@@ -581,6 +601,62 @@ static int nanoarrow_materialize_list_of(struct RConverter* converter,
581601
return NANOARROW_OK;
582602
}
583603

604+
// Materialize the current source slice of a fixed-size list array into the
// single child converter, which accumulates the flattened matrix values.
//
// Returns NANOARROW_OK on success, or EINVAL if the source array has a
// dictionary, if its storage type is not a fixed-size list, or if the child
// converter materializes a different number of elements than requested.
// Raises an R error (Rf_error) if the list size differs from the number of
// columns of the matrix ptype.
static int nanoarrow_materialize_matrix(struct RConverter* converter,
605+
SEXP converter_xptr) {
606+
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
607+
// Element 3 of the shelter holds the child converter external pointers; a
// matrix converter uses only the first one.
SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
608+
struct RConverter* child_converter = converter->children[0];
609+
SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0);
610+
611+
struct ArrayViewSlice* src = &converter->src;
612+
613+
// Make sure we error for dictionary types
614+
if (src->array_view->array->dictionary != NULL) {
615+
return EINVAL;
616+
}
617+
618+
// Only fixed-size list storage is accepted for conversion to a matrix.
switch (src->array_view->storage_type) {
619+
case NANOARROW_TYPE_FIXED_SIZE_LIST:
620+
break;
621+
default:
622+
return EINVAL;
623+
}
624+
625+
// Parent position in array coordinates (array offset plus slice offset),
// the number of child items per list element, and the total number of child
// items covered by this slice.
int64_t raw_src_offset = src->array_view->offset + src->offset;
626+
int64_t list_length = src->array_view->layout.child_size_elements;
627+
int64_t child_length = list_length * src->length;
628+
629+
if (list_length != Rf_ncols(converter->ptype_view.ptype)) {
630+
Rf_error("Can't convert fixed_size_list(list_size=%d) to matrix with %d cols",
631+
(int)list_length, Rf_ncols(converter->ptype_view.ptype));
632+
}
633+
634+
// First, we update the child array offset to account for the parent offset and
635+
// materialize the child array.
// NOTE(review): this adds to whatever offset the child converter already
// holds rather than assigning it -- confirm this is correct when the function
// runs more than once for the same array.
636+
child_converter->src.offset += raw_src_offset * list_length;
637+
child_converter->src.length = child_length;
638+
if (nanoarrow_converter_materialize_n(child_converter_xptr, child_length) !=
639+
child_length) {
640+
return EINVAL;
641+
}
642+
643+
// If we have parent nulls, we have to project them into the destination
644+
if (src->array_view->null_count != 0 &&
645+
src->array_view->buffer_views[0].data.data != NULL) {
646+
// Here, dst.offset has already been incremented such that it's at the end
647+
// of the chunk, but we need the original one for fill_vec_with_nulls().
648+
int64_t original_dst_offset = child_converter->dst.offset - child_length;
649+
for (int64_t i = 0; i < src->length; i++) {
650+
if (ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
651+
// A null parent element nulls out all list_length of its items in the
// child's destination vector.
fill_vec_with_nulls(child_converter->dst.vec_sexp,
652+
original_dst_offset + (i * list_length), list_length);
653+
}
654+
}
655+
}
656+
657+
return NANOARROW_OK;
658+
}
659+
584660
static int nanoarrow_materialize_base(struct RConverter* converter, SEXP converter_xptr) {
585661
struct ArrayViewSlice* src = &converter->src;
586662
struct VectorSlice* dst = &converter->dst;
@@ -614,6 +690,8 @@ static int nanoarrow_materialize_base(struct RConverter* converter, SEXP convert
614690
return nanoarrow_materialize_blob(src, dst, options);
615691
case VECTOR_TYPE_LIST_OF:
616692
return nanoarrow_materialize_list_of(converter, converter_xptr);
693+
case VECTOR_TYPE_MATRIX:
694+
return nanoarrow_materialize_matrix(converter, converter_xptr);
617695
case VECTOR_TYPE_DATA_FRAME:
618696
return nanoarrow_materialize_data_frame(converter, converter_xptr);
619697
default:

r/src/materialize_common.h

+1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ enum VectorType {
4444
VECTOR_TYPE_BLOB,
4545
VECTOR_TYPE_LIST_OF,
4646
VECTOR_TYPE_DATA_FRAME,
47+
VECTOR_TYPE_MATRIX,
4748
VECTOR_TYPE_OTHER
4849
};
4950

0 commit comments

Comments
 (0)