Skip to content

Commit 5d53738

Browse files
paleolimbotbkietz
andauthored
fix: Improve validation of offset buffers for sliced arrays (#626)
This PR updates validation of offset buffers in string/binary, large string/binary, list, and large list types such that portions of the buffers not strictly required by the slice referenced by `offset` and `length` are not accessed/validated. This came up in the IPC integration tests because C# exports full buffers rather than the portions of buffers strictly required. This highlights how our test suite would benefit from helpers to reduce repetition (although I'd prefer to handle that at a later time if possible). --------- Co-authored-by: Benjamin Kietzman <[email protected]>
1 parent 499d865 commit 5d53738

File tree

4 files changed

+211
-23
lines changed

4 files changed

+211
-23
lines changed

.github/workflows/integration.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ on:
2525
pull_request:
2626
paths:
2727
- .github/workflows/integration.yaml
28+
- ci/docker/integration.dockerfile
2829
- docker-compose.yml
2930
schedule:
3031
- cron: '5 0 * * 0'

docker-compose.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ services:
5454
volumes:
5555
- ${NANOARROW_DOCKER_SOURCE_DIR}:/arrow-integration/nanoarrow
5656
environment:
57-
ARCHERY_INTEGRATION_TARGET_LANGUAGES: "nanoarrow"
57+
ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS: "nanoarrow"
5858
command:
5959
["echo '::group::Build nanoarrow' &&
6060
conda run --no-capture-output /arrow-integration/ci/scripts/nanoarrow_build.sh /arrow-integration /build &&

src/nanoarrow/common/array.c

+41-8
Original file line numberDiff line numberDiff line change
@@ -990,14 +990,19 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
990990
case NANOARROW_TYPE_STRING:
991991
case NANOARROW_TYPE_BINARY:
992992
if (array_view->buffer_views[1].size_bytes != 0) {
993-
first_offset = array_view->buffer_views[1].data.as_int32[0];
993+
first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset];
994994
if (first_offset < 0) {
995995
ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64,
996996
first_offset);
997997
return EINVAL;
998998
}
999999

10001000
last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];
1001+
if (last_offset < 0) {
1002+
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
1003+
last_offset);
1004+
return EINVAL;
1005+
}
10011006

10021007
// If the data buffer size is unknown, assign it; otherwise, check it
10031008
if (array_view->buffer_views[2].size_bytes == -1) {
@@ -1021,14 +1026,19 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
10211026
case NANOARROW_TYPE_LARGE_STRING:
10221027
case NANOARROW_TYPE_LARGE_BINARY:
10231028
if (array_view->buffer_views[1].size_bytes != 0) {
1024-
first_offset = array_view->buffer_views[1].data.as_int64[0];
1029+
first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset];
10251030
if (first_offset < 0) {
10261031
ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64,
10271032
first_offset);
10281033
return EINVAL;
10291034
}
10301035

10311036
last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];
1037+
if (last_offset < 0) {
1038+
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
1039+
last_offset);
1040+
return EINVAL;
1041+
}
10321042

10331043
// If the data buffer size is unknown, assign it; otherwise, check it
10341044
if (array_view->buffer_views[2].size_bytes == -1) {
@@ -1065,14 +1075,20 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
10651075
case NANOARROW_TYPE_LIST:
10661076
case NANOARROW_TYPE_MAP:
10671077
if (array_view->buffer_views[1].size_bytes != 0) {
1068-
first_offset = array_view->buffer_views[1].data.as_int32[0];
1078+
first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset];
10691079
if (first_offset < 0) {
10701080
ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64,
10711081
first_offset);
10721082
return EINVAL;
10731083
}
10741084

10751085
last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];
1086+
if (last_offset < 0) {
1087+
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
1088+
last_offset);
1089+
return EINVAL;
1090+
}
1091+
10761092
if (array_view->children[0]->length < last_offset) {
10771093
ArrowErrorSet(error,
10781094
"Expected child of %s array to have length >= %" PRId64
@@ -1087,14 +1103,20 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
10871103

10881104
case NANOARROW_TYPE_LARGE_LIST:
10891105
if (array_view->buffer_views[1].size_bytes != 0) {
1090-
first_offset = array_view->buffer_views[1].data.as_int64[0];
1106+
first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset];
10911107
if (first_offset < 0) {
10921108
ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64,
10931109
first_offset);
10941110
return EINVAL;
10951111
}
10961112

10971113
last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];
1114+
if (last_offset < 0) {
1115+
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
1116+
last_offset);
1117+
return EINVAL;
1118+
}
1119+
10981120
if (array_view->children[0]->length < last_offset) {
10991121
ArrowErrorSet(error,
11001122
"Expected child of large list array to have length >= %" PRId64
@@ -1249,13 +1271,24 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
12491271
struct ArrowError* error) {
12501272
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
12511273
switch (array_view->layout.buffer_type[i]) {
1274+
// Only validate the portion of the buffer that is strictly required,
1275+
// which includes not validating the offset buffer of a zero-length array.
12521276
case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
1277+
if (array_view->length == 0) {
1278+
continue;
1279+
}
12531280
if (array_view->layout.element_size_bits[i] == 32) {
1254-
NANOARROW_RETURN_NOT_OK(
1255-
ArrowAssertIncreasingInt32(array_view->buffer_views[i], error));
1281+
struct ArrowBufferView offset_minimal;
1282+
offset_minimal.data.as_int32 =
1283+
array_view->buffer_views[i].data.as_int32 + array_view->offset;
1284+
offset_minimal.size_bytes = (array_view->length + 1) * sizeof(int32_t);
1285+
NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt32(offset_minimal, error));
12561286
} else {
1257-
NANOARROW_RETURN_NOT_OK(
1258-
ArrowAssertIncreasingInt64(array_view->buffer_views[i], error));
1287+
struct ArrowBufferView offset_minimal;
1288+
offset_minimal.data.as_int64 =
1289+
array_view->buffer_views[i].data.as_int64 + array_view->offset;
1290+
offset_minimal.size_bytes = (array_view->length + 1) * sizeof(int64_t);
1291+
NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt64(offset_minimal, error));
12591292
}
12601293
break;
12611294
default:

0 commit comments

Comments
 (0)