Skip to content

Commit 97e7c61

Browse files
authored
fix: Properly ingest Binary View types without variadic buffers (#635)
closes #634
1 parent 53c9f8f commit 97e7c61

File tree

3 files changed

+129
-43
lines changed

3 files changed

+129
-43
lines changed

src/nanoarrow/common/array.c

+4-6
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
115115
break;
116116
case NANOARROW_TYPE_BINARY_VIEW:
117117
case NANOARROW_TYPE_STRING_VIEW:
118-
array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
118+
array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1;
119119
break;
120120
case NANOARROW_TYPE_STRING:
121121
case NANOARROW_TYPE_LARGE_STRING:
@@ -765,11 +765,9 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
765765
const int32_t nfixed_buf = NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
766766

767767
const int32_t nvariadic_buf = (int32_t)(n_buffers - nfixed_buf - 1);
768-
if (nvariadic_buf > 0) {
769-
array_view->n_variadic_buffers = nvariadic_buf;
770-
buffers_required += nvariadic_buf + 1;
771-
array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers - 1];
772-
}
768+
array_view->n_variadic_buffers = nvariadic_buf;
769+
buffers_required += nvariadic_buf + 1;
770+
array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers - 1];
773771
}
774772

775773
if (buffers_required != array->n_buffers) {

src/nanoarrow/common/array_test.cc

+117-32
Original file line numberDiff line numberDiff line change
@@ -895,9 +895,48 @@ TEST(ArrayTest, ArrayTestAppendToLargeStringArray) {
895895
ArrowArrayRelease(&array);
896896
}
897897

898-
template <enum ArrowType ArrowT, typename ValueT,
899-
ArrowErrorCode (*AppendFunc)(struct ArrowArray*, ValueT)>
900-
void TestAppendToDataViewArray() {
898+
template <enum ArrowType ArrowT, typename ValueT>
899+
void TestAppendToInlinedDataViewArray(
900+
std::function<ArrowErrorCode(struct ArrowArray*, ValueT)> AppendFunc) {
901+
struct ArrowArray array;
902+
903+
ASSERT_EQ(ArrowArrayInitFromType(&array, ArrowT), NANOARROW_OK);
904+
EXPECT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
905+
906+
// Check that we can reserve
907+
ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
908+
EXPECT_EQ(ArrowArrayBuffer(&array, 1)->capacity_bytes,
909+
5 * sizeof(union ArrowBinaryView));
910+
911+
EXPECT_EQ(AppendFunc(&array, ValueT{{"inlinestring"}, 12}), NANOARROW_OK);
912+
EXPECT_EQ(ArrowArrayAppendNull(&array, 2), NANOARROW_OK);
913+
EXPECT_EQ(ArrowArrayAppendEmpty(&array, 1), NANOARROW_OK);
914+
EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
915+
916+
EXPECT_EQ(array.length, 4);
917+
EXPECT_EQ(array.null_count, 2);
918+
EXPECT_EQ(array.n_buffers, 3);
919+
auto validity_buffer = reinterpret_cast<const uint8_t*>(array.buffers[0]);
920+
auto inline_buffer = reinterpret_cast<const union ArrowBinaryView*>(array.buffers[1]);
921+
auto sizes_buffer = reinterpret_cast<const int64_t*>(array.buffers[2]);
922+
923+
EXPECT_EQ(validity_buffer[0], 0b00001001);
924+
EXPECT_EQ(memcmp(inline_buffer[0].inlined.data, "inlinestring", 12), 0);
925+
EXPECT_EQ(inline_buffer[0].inlined.size, 12);
926+
927+
EXPECT_EQ(sizes_buffer, nullptr);
928+
929+
// TODO: issue #633
930+
/*
931+
EXPECT_THAT(nanoarrow::ViewArrayAsBytes<64>(&array),
932+
ElementsAre("inlinestring"_asv, NA, NA, ""_asv));
933+
*/
934+
ArrowArrayRelease(&array);
935+
};
936+
937+
template <enum ArrowType ArrowT, typename ValueT>
938+
void TestAppendToDataViewArray(
939+
std::function<ArrowErrorCode(struct ArrowArray*, ValueT)> AppendFunc) {
901940
struct ArrowArray array;
902941

903942
ASSERT_EQ(ArrowArrayInitFromType(&array, ArrowT), NANOARROW_OK);
@@ -925,6 +964,7 @@ void TestAppendToDataViewArray() {
925964

926965
EXPECT_EQ(array.length, 7);
927966
EXPECT_EQ(array.null_count, 2);
967+
EXPECT_EQ(array.n_buffers, 5);
928968
auto validity_buffer = reinterpret_cast<const uint8_t*>(array.buffers[0]);
929969
auto inline_buffer = reinterpret_cast<const union ArrowBinaryView*>(array.buffers[1]);
930970
auto vbuf1 = reinterpret_cast<const char*>(array.buffers[2]);
@@ -964,13 +1004,17 @@ void TestAppendToDataViewArray() {
9641004
};
9651005

9661006
TEST(ArrayTest, ArrayTestAppendToBinaryViewArray) {
967-
TestAppendToDataViewArray<NANOARROW_TYPE_STRING_VIEW, struct ArrowStringView,
968-
ArrowArrayAppendString>();
1007+
TestAppendToInlinedDataViewArray<NANOARROW_TYPE_STRING_VIEW, struct ArrowStringView>(
1008+
ArrowArrayAppendString);
1009+
TestAppendToDataViewArray<NANOARROW_TYPE_STRING_VIEW, struct ArrowStringView>(
1010+
ArrowArrayAppendString);
9691011
};
9701012

9711013
TEST(ArrayTest, ArrayTestAppendToStringViewArray) {
972-
TestAppendToDataViewArray<NANOARROW_TYPE_BINARY_VIEW, struct ArrowBufferView,
973-
ArrowArrayAppendBytes>();
1014+
TestAppendToInlinedDataViewArray<NANOARROW_TYPE_BINARY_VIEW, struct ArrowBufferView>(
1015+
ArrowArrayAppendBytes);
1016+
TestAppendToDataViewArray<NANOARROW_TYPE_BINARY_VIEW, struct ArrowBufferView>(
1017+
ArrowArrayAppendBytes);
9741018
};
9751019

9761020
TEST(ArrayTest, ArrayTestAppendToFixedSizeBinaryArray) {
@@ -3343,8 +3387,49 @@ TEST(ArrayViewTest, ArrayViewTestGetString) {
33433387
TestGetFromBinary<FixedSizeBinaryBuilder>(fixed_size_builder);
33443388
}
33453389

3346-
template <typename BuilderClass>
3347-
void TestGetFromBinaryView(BuilderClass& builder) {
3390+
template <typename BuilderClass, typename ValueT>
3391+
void TestGetFromInlinedBinaryView(
3392+
BuilderClass& builder,
3393+
std::function<ValueT(const struct ArrowArrayView*, int64_t)> GetValueFunc,
3394+
std::function<const void*(const ValueT*)> GetValueDataFunc) {
3395+
struct ArrowArray array;
3396+
struct ArrowSchema schema;
3397+
struct ArrowArrayView array_view;
3398+
struct ArrowError error;
3399+
3400+
auto type = builder.type();
3401+
ARROW_EXPECT_OK(builder.Append("1234"));
3402+
ARROW_EXPECT_OK(builder.AppendNulls(2));
3403+
ARROW_EXPECT_OK(builder.Append("four"));
3404+
3405+
auto maybe_arrow_array = builder.Finish();
3406+
ARROW_EXPECT_OK(maybe_arrow_array);
3407+
auto arrow_array = maybe_arrow_array.ValueUnsafe();
3408+
3409+
ARROW_EXPECT_OK(ExportArray(*arrow_array, &array, &schema));
3410+
ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, &error), NANOARROW_OK);
3411+
ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
3412+
EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error),
3413+
NANOARROW_OK);
3414+
3415+
EXPECT_EQ(array_view.n_variadic_buffers, 0);
3416+
EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), 1);
3417+
EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 3), 0);
3418+
3419+
const auto value = GetValueFunc(&array_view, 3);
3420+
EXPECT_EQ(value.size_bytes, strlen("four"));
3421+
EXPECT_EQ(memcmp(GetValueDataFunc(&value), "four", value.size_bytes), 0);
3422+
3423+
ArrowArrayViewReset(&array_view);
3424+
ArrowArrayRelease(&array);
3425+
ArrowSchemaRelease(&schema);
3426+
}
3427+
3428+
template <typename BuilderClass, typename ValueT>
3429+
void TestGetFromBinaryView(
3430+
BuilderClass& builder,
3431+
std::function<ValueT(const struct ArrowArrayView*, int64_t)> GetValueFunc,
3432+
std::function<const void*(const ValueT*)> GetValueDataFunc) {
33483433
struct ArrowArray array;
33493434
struct ArrowSchema schema;
33503435
struct ArrowArrayView array_view;
@@ -3380,29 +3465,17 @@ void TestGetFromBinaryView(BuilderClass& builder) {
33803465
EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), 1);
33813466
EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 3), 0);
33823467

3383-
auto string_view = ArrowArrayViewGetStringUnsafe(&array_view, 3);
3384-
EXPECT_EQ(string_view.size_bytes, strlen("four"));
3385-
EXPECT_EQ(memcmp(string_view.data, "four", string_view.size_bytes), 0);
3386-
3387-
auto buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 3);
3388-
EXPECT_EQ(buffer_view.size_bytes, strlen("four"));
3389-
EXPECT_EQ(memcmp(buffer_view.data.as_char, "four", buffer_view.size_bytes), 0);
3390-
3391-
string_view = ArrowArrayViewGetStringUnsafe(&array_view, 4);
3392-
EXPECT_EQ(string_view.size_bytes, str1.size());
3393-
EXPECT_EQ(memcmp(string_view.data, str1.c_str(), string_view.size_bytes), 0);
3468+
const auto value1 = GetValueFunc(&array_view, 3);
3469+
EXPECT_EQ(value1.size_bytes, strlen("four"));
3470+
EXPECT_EQ(memcmp(GetValueDataFunc(&value1), "four", value1.size_bytes), 0);
33943471

3395-
string_view = ArrowArrayViewGetStringUnsafe(&array_view, 6);
3396-
EXPECT_EQ(string_view.size_bytes, str2.size());
3397-
EXPECT_EQ(memcmp(string_view.data, str2.c_str(), string_view.size_bytes), 0);
3472+
const auto value2 = GetValueFunc(&array_view, 4);
3473+
EXPECT_EQ(value2.size_bytes, str1.size());
3474+
EXPECT_EQ(memcmp(GetValueDataFunc(&value2), str1.c_str(), value2.size_bytes), 0);
33983475

3399-
buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 4);
3400-
EXPECT_EQ(buffer_view.size_bytes, str1.size());
3401-
EXPECT_EQ(memcmp(buffer_view.data.as_char, str1.c_str(), buffer_view.size_bytes), 0);
3402-
3403-
buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 6);
3404-
EXPECT_EQ(buffer_view.size_bytes, str2.size());
3405-
EXPECT_EQ(memcmp(buffer_view.data.as_char, str2.c_str(), buffer_view.size_bytes), 0);
3476+
const auto value3 = GetValueFunc(&array_view, 6);
3477+
EXPECT_EQ(value3.size_bytes, str2.size());
3478+
EXPECT_EQ(memcmp(GetValueDataFunc(&value3), str2.c_str(), value3.size_bytes), 0);
34063479

34073480
ArrowArrayViewReset(&array_view);
34083481
ArrowArrayRelease(&array);
@@ -3411,10 +3484,22 @@ void TestGetFromBinaryView(BuilderClass& builder) {
34113484

34123485
TEST(ArrayViewTest, ArrayViewTestGetStringView) {
34133486
auto string_view_builder = StringViewBuilder();
3414-
TestGetFromBinaryView<StringViewBuilder>(string_view_builder);
3487+
const auto get_string_view = [](const struct ArrowStringView* sv) { return sv->data; };
3488+
TestGetFromInlinedBinaryView<StringViewBuilder, struct ArrowStringView>(
3489+
string_view_builder, ArrowArrayViewGetStringUnsafe, get_string_view);
3490+
TestGetFromBinaryView<StringViewBuilder, struct ArrowStringView>(
3491+
string_view_builder, ArrowArrayViewGetStringUnsafe, get_string_view);
3492+
}
34153493

3494+
TEST(ArrayViewTest, ArrayViewTestGetBinaryView) {
34163495
auto binary_view_builder = BinaryViewBuilder();
3417-
TestGetFromBinaryView<BinaryViewBuilder>(binary_view_builder);
3496+
const auto get_buffer_view = [](const struct ArrowBufferView* bv) {
3497+
return bv->data.data;
3498+
};
3499+
TestGetFromInlinedBinaryView<BinaryViewBuilder, struct ArrowBufferView>(
3500+
binary_view_builder, ArrowArrayViewGetBytesUnsafe, get_buffer_view);
3501+
TestGetFromBinaryView<BinaryViewBuilder, struct ArrowBufferView>(
3502+
binary_view_builder, ArrowArrayViewGetBytesUnsafe, get_buffer_view);
34183503
}
34193504

34203505
TEST(ArrayViewTest, ArrayViewTestGetIntervalYearMonth) {

src/nanoarrow/common/inline_array.h

+8-5
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,8 @@ static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array,
468468
return NANOARROW_OK;
469469
}
470470

471+
// Binary views only have two fixed buffers, but be aware that they must also
472+
// always have 1 more buffer to store variadic buffer sizes (even if there are none)
471473
#define NANOARROW_BINARY_VIEW_FIXED_BUFFERS 2
472474
#define NANOARROW_BINARY_VIEW_INLINE_SIZE 12
473475
#define NANOARROW_BINARY_VIEW_PREFIX_SIZE 4
@@ -504,27 +506,28 @@ static inline int32_t ArrowArrayVariadicBufferCount(struct ArrowArray* array) {
504506
static inline ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray* array,
505507
int32_t nbuffers) {
506508
const int32_t n_current_bufs = ArrowArrayVariadicBufferCount(array);
507-
const int32_t n_bufs_needed = n_current_bufs + nbuffers;
509+
const int32_t nvariadic_bufs_needed = n_current_bufs + nbuffers;
508510

509511
struct ArrowArrayPrivateData* private_data =
510512
(struct ArrowArrayPrivateData*)array->private_data;
511513

512514
private_data->variadic_buffers = (struct ArrowBuffer*)ArrowRealloc(
513-
private_data->variadic_buffers, sizeof(struct ArrowBuffer) * n_bufs_needed);
515+
private_data->variadic_buffers, sizeof(struct ArrowBuffer) * nvariadic_bufs_needed);
514516
if (private_data->variadic_buffers == NULL) {
515517
return ENOMEM;
516518
}
517519
private_data->variadic_buffer_sizes = (int64_t*)ArrowRealloc(
518-
private_data->variadic_buffer_sizes, sizeof(int64_t) * n_bufs_needed);
520+
private_data->variadic_buffer_sizes, sizeof(int64_t) * nvariadic_bufs_needed);
519521
if (private_data->variadic_buffer_sizes == NULL) {
520522
return ENOMEM;
521523
}
522524

523-
for (int32_t i = n_current_bufs; i < n_bufs_needed; i++) {
525+
for (int32_t i = n_current_bufs; i < nvariadic_bufs_needed; i++) {
524526
ArrowBufferInit(&private_data->variadic_buffers[i]);
525527
private_data->variadic_buffer_sizes[i] = 0;
526528
}
527-
private_data->n_variadic_buffers = n_bufs_needed;
529+
private_data->n_variadic_buffers = nvariadic_bufs_needed;
530+
array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1 + nvariadic_bufs_needed;
528531

529532
return NANOARROW_OK;
530533
}

0 commit comments

Comments
 (0)