Skip to content

Commit

Permalink
fix: Encoding of List offsets was incorrect when slice offsets begin …
Browse files Browse the repository at this point in the history
…with zero (#6805)

* fix: Encoding of List offsets was incorrect when slice offsets begin with zero

When encoding offsets the code had an optimization to reuse the offsets if the first offset was zero, assuming the slice already pointed
 to the first element. But the offset can also be zero if all previous lists were empty. When this occurred it would mark all lists in the
slice as empty, even if they shouldn't be.

* Use Buffer::from_slice_ref which will be faster as it doesn't iterate through the slice.

* Avoid copying

* Explicitly reference std::mem::size_of
  • Loading branch information
HawaiianSpork authored Nov 29, 2024
1 parent 283fb4b commit fe7e71a
Showing 1 changed file with 51 additions and 1 deletion.
52 changes: 51 additions & 1 deletion arrow-ipc/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
use std::cmp::min;
use std::collections::HashMap;
use std::io::{BufWriter, Write};
use std::mem::size_of;
use std::sync::Arc;

use flatbuffers::FlatBufferBuilder;
Expand Down Expand Up @@ -1430,7 +1431,13 @@ fn reencode_offsets<O: OffsetSizeTrait>(
let end_offset = offset_slice.last().unwrap();

let offsets = match start_offset.as_usize() {
0 => offsets.clone(),
0 => {
let size = size_of::<O>();
offsets.slice_with_length(
data.offset() * size,
(data.offset() + data.len() + 1) * size,
)
}
_ => offset_slice.iter().map(|x| *x - *start_offset).collect(),
};

Expand Down Expand Up @@ -2517,6 +2524,36 @@ mod tests {
ls.finish()
}

/// Builds a nested `List<List<UInt32>>` array whose first 999 rows contain
/// only empty inner lists.
///
/// Slicing past that prefix yields a slice whose first value offset is zero
/// even though the slice does not start at the beginning of the offsets
/// buffer — exactly the shape needed to exercise the IPC offset
/// re-encoding fix (#6805).
fn generate_nested_list_data_starting_at_zero<O: OffsetSizeTrait>() -> GenericListArray<O> {
    let inner = GenericListBuilder::<O, _>::new(UInt32Builder::new());
    let mut builder = GenericListBuilder::<O, _>::new(inner);

    // 999 rows of `[[]]`: each outer list holds a single empty inner list,
    // so the child value offsets stay pinned at zero through this prefix.
    for _ in 0..999 {
        builder.values().append(true);
        builder.append(true);
    }

    // One populated row: ten inner lists, each repeating its index four times.
    for j in 0..10 {
        for _ in 0..4 {
            builder.values().values().append_value(j);
        }
        builder.values().append(true);
    }
    builder.append(true);

    // 9,000 more rows of the same shape with shifting values, to make the
    // encoded batch large enough to be sliced during the round trip.
    for i in 0..9_000 {
        for j in 0..10 {
            for _ in 0..4 {
                builder.values().values().append_value(i + j);
            }
            builder.values().append(true);
        }
        builder.append(true);
    }

    builder.finish()
}

fn generate_map_array_data() -> MapArray {
let keys_builder = UInt32Builder::new();
let values_builder = UInt32Builder::new();
Expand Down Expand Up @@ -2608,6 +2645,19 @@ mod tests {
roundtrip_ensure_sliced_smaller(in_batch, 1000);
}

/// Round-trips a nested list batch whose interesting slice begins at a
/// zero value offset that is *not* the start of the offsets buffer,
/// guarding against the regression fixed in #6805.
#[test]
fn encode_nested_lists_starting_at_zero() {
    let item = Arc::new(Field::new("item", DataType::UInt32, true));
    let inner = Arc::new(Field::new("item", DataType::List(item), true));
    let outer = Field::new("val", DataType::List(inner), true);
    let schema = Arc::new(Schema::new(vec![outer]));

    let column = Arc::new(generate_nested_list_data_starting_at_zero::<i32>());
    let batch = RecordBatch::try_new(schema, vec![column]).unwrap();

    // Slice down to single-row batches so the encoder must re-encode offsets.
    roundtrip_ensure_sliced_smaller(batch, 1);
}

#[test]
fn encode_map_array() {
let keys = Arc::new(Field::new("keys", DataType::UInt32, false));
Expand Down

0 comments on commit fe7e71a

Please sign in to comment.