Skip to content

Commit 57acfbe

Browse files
authored
fix: Ignore empty (but present) union validity bitmaps from before 1.0 (#630)
For metadata versions before V5, union arrays carry an extra validity bitmap buffer. The decoder now skips that buffer when it is empty and raises an error when it is non-empty (the same bail-out used by the Arrow C++ and Java implementations).
1 parent 61c6917 commit 57acfbe

File tree

1 file changed

+40
-8
lines changed

1 file changed

+40
-8
lines changed

src/nanoarrow/ipc/decoder.c

+40-8
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ struct ArrowIpcDecoderPrivate {
9595
// The number of buffers that future RecordBatch messages must have to match the schema
9696
// that has been set.
9797
int64_t n_buffers;
98+
// The number of union fields in the Schema.
99+
int64_t n_union_fields;
98100
// A pointer to the last flatbuffers message.
99101
const void* last_message;
100102
// Storage for a Footer
@@ -261,6 +263,8 @@ void ArrowIpcDecoderReset(struct ArrowIpcDecoder* decoder) {
261263
private_data->n_fields = 0;
262264
}
263265

266+
private_data->n_union_fields = 0;
267+
264268
ArrowIpcFooterReset(&private_data->footer);
265269

266270
ArrowFree(private_data);
@@ -924,9 +928,16 @@ static int ArrowIpcDecoderDecodeRecordBatchHeader(struct ArrowIpcDecoder* decode
924928
return EINVAL;
925929
}
926930

927-
if ((n_buffers + 1) != private_data->n_buffers) {
931+
int64_t n_expected_buffers = private_data->n_buffers;
932+
if (decoder->metadata_version < NANOARROW_IPC_METADATA_VERSION_V5) {
933+
// Unions had null buffers before arrow 1.0, so expect one extra buffer per union
934+
// field
935+
n_expected_buffers += private_data->n_union_fields;
936+
}
937+
938+
if ((n_buffers + 1) != n_expected_buffers) {
928939
ArrowErrorSet(error, "Expected %" PRId64 " buffers in message but found %" PRId64,
929-
private_data->n_buffers - 1, n_buffers);
940+
n_expected_buffers - 1, n_buffers);
930941
return EINVAL;
931942
}
932943

@@ -1179,14 +1190,14 @@ ArrowErrorCode ArrowIpcDecoderDecodeHeader(struct ArrowIpcDecoder* decoder,
11791190

11801191
switch (decoder->metadata_version) {
11811192
case ns(MetadataVersion_V5):
1193+
case ns(MetadataVersion_V4):
11821194
break;
11831195
case ns(MetadataVersion_V1):
11841196
case ns(MetadataVersion_V2):
11851197
case ns(MetadataVersion_V3):
1186-
case ns(MetadataVersion_V4):
1187-
ArrowErrorSet(error, "Expected metadata version V5 but found %s",
1198+
ArrowErrorSet(error, "Expected metadata version V4 or V5 but found %s",
11881199
ns(MetadataVersion_name(ns(Message_version(message)))));
1189-
break;
1200+
return EINVAL;
11901201
default:
11911202
ArrowErrorSet(error, "Unexpected value for Message metadata version (%d)",
11921203
decoder->metadata_version);
@@ -1307,7 +1318,7 @@ static void ArrowIpcDecoderCountFields(struct ArrowSchema* schema, int64_t* n_fi
13071318
static void ArrowIpcDecoderInitFields(struct ArrowIpcField* fields,
13081319
struct ArrowArrayView* array_view,
13091320
struct ArrowArray* array, int64_t* n_fields,
1310-
int64_t* n_buffers) {
1321+
int64_t* n_buffers, int64_t* n_union_fields) {
13111322
struct ArrowIpcField* field = fields + (*n_fields);
13121323
field->array_view = array_view;
13131324
field->array = array;
@@ -1316,12 +1327,14 @@ static void ArrowIpcDecoderInitFields(struct ArrowIpcField* fields,
13161327
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
13171328
*n_buffers += array_view->layout.buffer_type[i] != NANOARROW_BUFFER_TYPE_NONE;
13181329
}
1330+
*n_union_fields += array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION ||
1331+
array_view->storage_type == NANOARROW_TYPE_DENSE_UNION;
13191332

13201333
*n_fields += 1;
13211334

13221335
for (int64_t i = 0; i < array_view->n_children; i++) {
13231336
ArrowIpcDecoderInitFields(fields, array_view->children[i], array->children[i],
1324-
n_fields, n_buffers);
1337+
n_fields, n_buffers, n_union_fields);
13251338
}
13261339
}
13271340

@@ -1334,6 +1347,7 @@ ArrowErrorCode ArrowIpcDecoderSetSchema(struct ArrowIpcDecoder* decoder,
13341347
// Reset previously allocated schema-specific resources
13351348
private_data->n_buffers = 0;
13361349
private_data->n_fields = 0;
1350+
private_data->n_union_fields = 0;
13371351
ArrowArrayViewReset(&private_data->array_view);
13381352
if (private_data->array.release != NULL) {
13391353
ArrowArrayRelease(&private_data->array);
@@ -1368,7 +1382,8 @@ ArrowErrorCode ArrowIpcDecoderSetSchema(struct ArrowIpcDecoder* decoder,
13681382
// Init field information and calculate starting buffer offset for each
13691383
int64_t field_i = 0;
13701384
ArrowIpcDecoderInitFields(private_data->fields, &private_data->array_view,
1371-
&private_data->array, &field_i, &private_data->n_buffers);
1385+
&private_data->array, &field_i, &private_data->n_buffers,
1386+
&private_data->n_union_fields);
13721387

13731388
return NANOARROW_OK;
13741389
}
@@ -1604,6 +1619,7 @@ struct ArrowIpcArraySetter {
16041619
int64_t body_size_bytes;
16051620
struct ArrowIpcBufferSource src;
16061621
struct ArrowIpcBufferFactory factory;
1622+
enum ArrowIpcMetadataVersion version;
16071623
};
16081624

16091625
static int ArrowIpcDecoderMakeBuffer(struct ArrowIpcArraySetter* setter, int64_t offset,
@@ -1691,6 +1707,21 @@ static int ArrowIpcDecoderWalkSetArrayView(struct ArrowIpcArraySetter* setter,
16911707
array_view->null_count = ns(FieldNode_null_count(field));
16921708
setter->field_i += 1;
16931709

1710+
if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION ||
1711+
array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
1712+
if (setter->version < NANOARROW_IPC_METADATA_VERSION_V5) {
1713+
ns(Buffer_struct_t) buffer =
1714+
ns(Buffer_vec_at(setter->buffers, (size_t)setter->buffer_i));
1715+
if (ns(Buffer_length(buffer)) != 0) {
1716+
ArrowErrorSet(error,
1717+
"Cannot read pre-1.0.0 Union array with top-level validity bitmap");
1718+
return EINVAL;
1719+
}
1720+
// skip the empty validity bitmap
1721+
setter->buffer_i += 1;
1722+
}
1723+
}
1724+
16941725
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
16951726
if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
16961727
break;
@@ -1803,6 +1834,7 @@ static ArrowErrorCode ArrowIpcDecoderDecodeArrayViewInternal(
18031834
setter.factory = factory;
18041835
setter.src.codec = decoder->codec;
18051836
setter.src.swap_endian = ArrowIpcDecoderNeedsSwapEndian(decoder);
1837+
setter.version = decoder->metadata_version;
18061838

18071839
// The flatbuffers FieldNode doesn't count the root struct so we have to loop over the
18081840
// children ourselves

0 commit comments

Comments
 (0)