Skip to content

Commit 6f8badb

Browse files
authored
feat: String/Binary View Support (#596)
closes #583
1 parent 44e8eb9 commit 6f8badb

13 files changed

+518
-51
lines changed

python/bootstrap.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -172,7 +172,10 @@ def _pxd_header(self):
172172

173173
class NanoarrowPxdGenerator(PxdGenerator):
174174
def _preprocess_content(self, content):
175-
return re.sub(r"NANOARROW_MAX_FIXED_BUFFERS", "3", content)
175+
content = re.sub(r"NANOARROW_MAX_FIXED_BUFFERS", "3", content)
176+
content = re.sub(r"NANOARROW_BINARY_VIEW_INLINE_SIZE", "12", content)
177+
content = re.sub(r"NANOARROW_BINARY_VIEW_PREFIX_SIZE", "4", content)
178+
return content
176179

177180
def _pxd_header(self):
178181
return (

python/src/nanoarrow/__init__.py

+4
Original file line number | Diff line number | Diff line change
@@ -48,12 +48,14 @@
4848
float64,
4949
string,
5050
large_string,
51+
string_view,
5152
list_,
5253
large_list,
5354
fixed_size_list,
5455
dictionary,
5556
binary,
5657
large_binary,
58+
binary_view,
5759
fixed_size_binary,
5860
date32,
5961
date64,
@@ -82,6 +84,7 @@
8284
"TimeUnit",
8385
"Type",
8486
"binary",
87+
"binary_view",
8588
"bool_",
8689
"c_array",
8790
"c_array_from_buffers",
@@ -117,6 +120,7 @@
117120
"nulls_forbid",
118121
"nulls_separate",
119122
"string",
123+
"string_view",
120124
"struct",
121125
"schema",
122126
"time32",

python/src/nanoarrow/_types.pxd

+4
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,10 @@ cpdef enum CArrowType:
6767
LARGE_BINARY = NANOARROW_TYPE_LARGE_BINARY
6868
LARGE_LIST = NANOARROW_TYPE_LARGE_LIST
6969
INTERVAL_MONTH_DAY_NANO = NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
70+
RUN_END_ENCODED = NANOARROW_TYPE_RUN_END_ENCODED
71+
BINARY_VIEW = NANOARROW_TYPE_BINARY_VIEW
72+
STRING_VIEW = NANOARROW_TYPE_STRING_VIEW
73+
7074

7175
cdef equal(int type_id1, int type_id2)
7276

python/src/nanoarrow/schema.py

+39
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,9 @@ class Type(enum.Enum):
7676
LARGE_BINARY = int(_types.LARGE_BINARY)
7777
LARGE_LIST = int(_types.LARGE_LIST)
7878
INTERVAL_MONTH_DAY_NANO = int(_types.INTERVAL_MONTH_DAY_NANO)
79+
RUN_END_ENCODED = int(_types.RUN_END_ENCODED)
80+
BINARY_VIEW = int(_types.BINARY_VIEW)
81+
STRING_VIEW = int(_types.STRING_VIEW)
7982

8083
def __arrow_c_schema__(self):
8184
# This will only work for parameter-free types
@@ -784,6 +787,24 @@ def large_string(nullable: bool = True) -> Schema:
784787
return Schema(Type.LARGE_STRING, nullable=nullable)
785788

786789

790+
def string_view(nullable: bool = True) -> Schema:
    """Construct a schema of the string view type.

    Parameters
    ----------
    nullable : bool, optional
        Pass ``False`` to mark this field as non-nullable.

    Examples
    --------

    >>> import nanoarrow as na
    >>> na.string_view()
    <Schema> string_view
    """
    view_schema = Schema(Type.STRING_VIEW, nullable=nullable)
    return view_schema
806+
807+
787808
def binary(nullable: bool = True) -> Schema:
788809
"""Create an instance of a variable or fixed-width binary type.
789810
@@ -820,6 +841,24 @@ def large_binary(nullable: bool = True) -> Schema:
820841
return Schema(Type.LARGE_BINARY, nullable=nullable)
821842

822843

844+
def binary_view(nullable: bool = True) -> Schema:
    """Construct a schema of the binary view type.

    Parameters
    ----------
    nullable : bool, optional
        Pass ``False`` to mark this field as non-nullable.

    Examples
    --------

    >>> import nanoarrow as na
    >>> na.binary_view()
    <Schema> binary_view
    """
    view_schema = Schema(Type.BINARY_VIEW, nullable=nullable)
    return view_schema
860+
861+
823862
def fixed_size_binary(byte_width: int, nullable: bool = True) -> Schema:
824863
"""Create an instance of a variable or fixed-width binary type.
825864

python/tests/test_schema.py

+2
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,8 @@ def test_schema_simple():
107107
assert na.interval_months().type == na.Type.INTERVAL_MONTHS
108108
assert na.interval_day_time().type == na.Type.INTERVAL_DAY_TIME
109109
assert na.interval_month_day_nano().type == na.Type.INTERVAL_MONTH_DAY_NANO
110+
assert na.binary_view().type == na.Type.BINARY_VIEW
111+
assert na.string_view().type == na.Type.STRING_VIEW
110112

111113

112114
def test_schema_fixed_size_binary():

src/nanoarrow/common/array.c

+59-7
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
#include <errno.h>
1919
#include <inttypes.h>
2020
#include <stdarg.h>
21+
#include <stdbool.h>
2122
#include <stdio.h>
2223
#include <stdlib.h>
2324
#include <string.h>
@@ -32,6 +33,12 @@ static void ArrowArrayReleaseInternal(struct ArrowArray* array) {
3233
ArrowBitmapReset(&private_data->bitmap);
3334
ArrowBufferReset(&private_data->buffers[0]);
3435
ArrowBufferReset(&private_data->buffers[1]);
36+
ArrowFree(private_data->buffer_data);
37+
for (int32_t i = 0; i < private_data->n_variadic_buffers; ++i) {
38+
ArrowBufferReset(&private_data->variadic_buffers[i]);
39+
}
40+
ArrowFree(private_data->variadic_buffers);
41+
ArrowFree(private_data->variadic_buffer_sizes);
3542
ArrowFree(private_data);
3643
}
3744

@@ -106,6 +113,10 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
106113
case NANOARROW_TYPE_DENSE_UNION:
107114
array->n_buffers = 2;
108115
break;
116+
case NANOARROW_TYPE_BINARY_VIEW:
117+
case NANOARROW_TYPE_STRING_VIEW:
118+
array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
119+
break;
109120
case NANOARROW_TYPE_STRING:
110121
case NANOARROW_TYPE_LARGE_STRING:
111122
case NANOARROW_TYPE_BINARY:
@@ -148,12 +159,17 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array,
148159
ArrowBitmapInit(&private_data->bitmap);
149160
ArrowBufferInit(&private_data->buffers[0]);
150161
ArrowBufferInit(&private_data->buffers[1]);
151-
private_data->buffer_data[0] = NULL;
152-
private_data->buffer_data[1] = NULL;
153-
private_data->buffer_data[2] = NULL;
162+
private_data->buffer_data =
163+
(const void**)ArrowMalloc(sizeof(void*) * NANOARROW_MAX_FIXED_BUFFERS);
164+
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; ++i) {
165+
private_data->buffer_data[i] = NULL;
166+
}
167+
private_data->n_variadic_buffers = 0;
168+
private_data->variadic_buffers = NULL;
169+
private_data->variadic_buffer_sizes = NULL;
154170

155171
array->private_data = private_data;
156-
array->buffers = (const void**)(&private_data->buffer_data);
172+
array->buffers = (const void**)(private_data->buffer_data);
157173

158174
// These are not technically "storage" in the sense that they do not appear
159175
// in the ArrowSchemaView's storage_type member; however, allowing them here
@@ -456,10 +472,26 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) {
456472
struct ArrowArrayPrivateData* private_data =
457473
(struct ArrowArrayPrivateData*)array->private_data;
458474

459-
for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
475+
const bool is_binary_view = private_data->storage_type == NANOARROW_TYPE_STRING_VIEW ||
476+
private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW;
477+
const int32_t nfixed_buf = is_binary_view ? 2 : NANOARROW_MAX_FIXED_BUFFERS;
478+
479+
for (int32_t i = 0; i < nfixed_buf; i++) {
460480
private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
461481
}
462482

483+
if (is_binary_view) {
484+
const int32_t nvirt_buf = private_data->n_variadic_buffers;
485+
private_data->buffer_data = (const void**)ArrowRealloc(
486+
private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf + 1));
487+
for (int32_t i = 0; i < nvirt_buf; i++) {
488+
private_data->buffer_data[nfixed_buf + i] = private_data->variadic_buffers[i].data;
489+
}
490+
private_data->buffer_data[nfixed_buf + nvirt_buf] =
491+
private_data->variadic_buffer_sizes;
492+
array->buffers = (const void**)(private_data->buffer_data);
493+
}
494+
463495
for (int64_t i = 0; i < array->n_children; i++) {
464496
ArrowArrayFlushInternalPointers(array->children[i]);
465497
}
@@ -664,6 +696,7 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length)
664696
_ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) /
665697
8;
666698
continue;
699+
case NANOARROW_BUFFER_TYPE_DATA_VIEW:
667700
case NANOARROW_BUFFER_TYPE_TYPE_ID:
668701
case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
669702
array_view->buffer_views[i].size_bytes = element_size_bytes * length;
@@ -700,9 +733,15 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
700733
array_view->offset = array->offset;
701734
array_view->length = array->length;
702735
array_view->null_count = array->null_count;
736+
array_view->variadic_buffer_sizes = NULL;
737+
array_view->n_variadic_buffers = 0;
703738

704739
int64_t buffers_required = 0;
705-
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
740+
const int nfixed_buf = array_view->storage_type == NANOARROW_TYPE_STRING_VIEW ||
741+
array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW
742+
? NANOARROW_BINARY_VIEW_FIXED_BUFFERS
743+
: NANOARROW_MAX_FIXED_BUFFERS;
744+
for (int i = 0; i < nfixed_buf; i++) {
706745
if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
707746
break;
708747
}
@@ -720,7 +759,19 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
720759
}
721760
}
722761

723-
// Check the number of buffers
762+
if (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW ||
763+
array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) {
764+
const int64_t n_buffers = array->n_buffers;
765+
const int32_t nfixed_buf = NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
766+
767+
const int32_t nvariadic_buf = (int32_t)(n_buffers - nfixed_buf - 1);
768+
if (nvariadic_buf > 0) {
769+
array_view->n_variadic_buffers = nvariadic_buf;
770+
buffers_required += nvariadic_buf + 1;
771+
array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers - 1];
772+
}
773+
}
774+
724775
if (buffers_required != array->n_buffers) {
725776
ArrowErrorSet(error,
726777
"Expected array with %" PRId64 " buffer(s) but found %" PRId64
@@ -814,6 +865,7 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
814865
break;
815866
case NANOARROW_BUFFER_TYPE_TYPE_ID:
816867
case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
868+
case NANOARROW_BUFFER_TYPE_DATA_VIEW:
817869
min_buffer_size_bytes = element_size_bytes * offset_plus_length;
818870
break;
819871
case NANOARROW_BUFFER_TYPE_NONE:

0 commit comments

Comments
 (0)