From fe7e71ac23466a2a4dc142d73e5ec2008ca3e222 Mon Sep 17 00:00:00 2001
From: Michael Maletich
Date: Fri, 29 Nov 2024 11:07:05 -0600
Subject: [PATCH] fix: Encoding of List offsets was incorrect when slice offsets begin with zero (#6805)

* fix: Encoding of List offsets was incorrect when slice offsets begin with zero

When encoding offsets, the code had an optimization to reuse the offsets
buffer if the first offset was zero, assuming the slice already pointed to the
first element. But the offset can also be zero if all previous lists were
empty. When this occurred it would encode every list in the slice as empty,
even if it shouldn't be.

* Use Buffer::from_slice_ref, which will be faster as it doesn't iterate
through the slice.

* Avoid copying

* Explicitly reference std::mem::size_of
---
 arrow-ipc/src/writer.rs | 52 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index 23cefede7b3..dfaf12892a7 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -23,6 +23,7 @@
 use std::cmp::min;
 use std::collections::HashMap;
 use std::io::{BufWriter, Write};
+use std::mem::size_of;
 use std::sync::Arc;
 
 use flatbuffers::FlatBufferBuilder;
@@ -1430,7 +1431,13 @@ fn reencode_offsets<O: OffsetSizeTrait>(
     let end_offset = offset_slice.last().unwrap();
 
     let offsets = match start_offset.as_usize() {
-        0 => offsets.clone(),
+        0 => {
+            let size = size_of::<O>();
+            offsets.slice_with_length(
+                data.offset() * size,
+                (data.offset() + data.len() + 1) * size,
+            )
+        }
         _ => offset_slice.iter().map(|x| *x - *start_offset).collect(),
     };
 
@@ -2517,6 +2524,36 @@ mod tests {
         ls.finish()
     }
 
+    fn generate_nested_list_data_starting_at_zero<O: OffsetSizeTrait>() -> GenericListArray<O> {
+        let mut ls =
+            GenericListBuilder::<O, _>::new(GenericListBuilder::<O, _>::new(UInt32Builder::new()));
+
+        for _i in 0..999 {
+            ls.values().append(true);
+            ls.append(true);
+        }
+
+        for j in 0..10 {
+            for value in [j, j, j, j] {
+                ls.values().values().append_value(value);
+            }
+            ls.values().append(true)
+        }
+        ls.append(true);
+
+        for i in 0..9_000 {
+            for j in 0..10 {
+                for value in [i + j, i + j, i + j, i + j] {
+                    ls.values().values().append_value(value);
+                }
+                ls.values().append(true)
+            }
+            ls.append(true);
+        }
+
+        ls.finish()
+    }
+
     fn generate_map_array_data() -> MapArray {
         let keys_builder = UInt32Builder::new();
         let values_builder = UInt32Builder::new();
@@ -2608,6 +2645,19 @@ mod tests {
         roundtrip_ensure_sliced_smaller(in_batch, 1000);
     }
 
+    #[test]
+    fn encode_nested_lists_starting_at_zero() {
+        let inner_int = Arc::new(Field::new("item", DataType::UInt32, true));
+        let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_int), true));
+        let list_field = Field::new("val", DataType::List(inner_list_field), true);
+        let schema = Arc::new(Schema::new(vec![list_field]));
+
+        let values = Arc::new(generate_nested_list_data_starting_at_zero::<i32>());
+
+        let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap();
+        roundtrip_ensure_sliced_smaller(in_batch, 1);
+    }
+
     #[test]
     fn encode_map_array() {
         let keys = Arc::new(Field::new("keys", DataType::UInt32, false));
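
Note (not part of the patch): a minimal standalone sketch of the condition the commit message
describes, showing that a sliced list can have a first offset value of zero without actually
starting at the beginning of the offsets buffer. It assumes the arrow-array, arrow-buffer and
arrow-schema crates (the same types are re-exported by the arrow crate); names are illustrative
only and this is not the test added by the patch.

use std::sync::Arc;

use arrow_array::{Array, Int32Array, ListArray};
use arrow_buffer::OffsetBuffer;
use arrow_schema::{DataType, Field};

fn main() {
    // Three lists: [], [], [1, 2, 3]  ->  offsets buffer [0, 0, 0, 3]
    let field = Arc::new(Field::new("item", DataType::Int32, true));
    let values = Int32Array::from(vec![1, 2, 3]);
    let offsets = OffsetBuffer::<i32>::new(vec![0, 0, 0, 3].into());
    let list = ListArray::new(field, offsets, Arc::new(values), None);

    // Slice away the two leading empty lists at the ArrayData level, which is
    // the view the IPC writer works with. The slice's logical offset is 2,
    // yet its first offset *value* is still 0 because every preceding list is
    // empty.
    let data = list.to_data().slice(2, 1);
    assert_eq!(data.offset(), 2);
    let offset_values: &[i32] = data.buffers()[0].typed_data();
    assert_eq!(offset_values[data.offset()], 0);

    // The old `start_offset == 0 => offsets.clone()` shortcut treated this as
    // "slice starts at the first element" and reused the whole offsets buffer,
    // so a reader paired offsets [0, 0] with the values and decoded the list
    // as empty. The list is in fact non-empty:
    let sliced = ListArray::from(data);
    assert_eq!(sliced.value(0).len(), 3);
}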