Skip to content

Commit

Permalink
fix: bypass the arrow take for struct array (#3500)
Browse files Browse the repository at this point in the history
  • Loading branch information
BubbleCal authored Mar 4, 2025
1 parent dca745b commit eb16635
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 4 deletions.
9 changes: 9 additions & 0 deletions python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2983,3 +2983,12 @@ def test_data_replacement(tmp_path: Path):
}
)
assert tbl == expected


def test_empty_structs(tmp_path):
schema = pa.schema([pa.field("id", pa.int32()), pa.field("empties", pa.struct([]))])
table = pa.table({"id": [0, 1, 2], "empties": [{}] * 3}, schema=schema)
ds = lance.write_dataset(table, tmp_path)
res = ds.take([2, 0, 1])
assert res.num_rows == 3
assert res == table.take([2, 0, 1])
49 changes: 45 additions & 4 deletions rust/lance/src/dataset/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ use std::{collections::BTreeMap, ops::Range, pin::Pin, sync::Arc};
use crate::dataset::fragment::FragReadConfig;
use crate::dataset::rowids::get_row_id_index;
use crate::{Error, Result};
use arrow::{array::as_struct_array, compute::concat_batches, datatypes::UInt64Type};
use arrow::{compute::concat_batches, datatypes::UInt64Type};
use arrow_array::cast::AsArray;
use arrow_array::{RecordBatch, StructArray, UInt64Array};
use arrow_array::{Array, RecordBatch, StructArray, UInt64Array};
use arrow_buffer::{ArrowNativeType, BooleanBuffer, Buffer, NullBuffer};
use arrow_schema::{Field as ArrowField, Schema as ArrowSchema};
use datafusion::error::DataFusionError;
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
Expand Down Expand Up @@ -283,9 +284,13 @@ async fn do_take_rows(
// Remove the rowaddr column.
let keep_indices = (0..one_batch.num_columns() - 1).collect::<Vec<_>>();
let one_batch = one_batch.project(&keep_indices)?;

// There's a bug in arrow_select::take::take, that it doesn't handle empty struct correctly,
// so we need to handle it manually here.
// TODO: remove this once the bug is fixed.
let struct_arr: StructArray = one_batch.into();
let reordered = arrow_select::take::take(&struct_arr, &remapping_index, None)?;
Ok(as_struct_array(&reordered).into())
let reordered = take_struct_array(&struct_arr, &remapping_index)?;
Ok(reordered.into())
}?;

let batch = projection.project_batch(batch).await?;
Expand Down Expand Up @@ -553,6 +558,42 @@ impl TakeBuilder {
}
}

fn take_struct_array(array: &StructArray, indices: &UInt64Array) -> Result<StructArray> {
let nulls = array.nulls().map(|nulls| {
let is_valid = indices.iter().map(|index| {
if let Some(index) = index {
nulls.is_valid(index.to_usize().unwrap())
} else {
false
}
});
NullBuffer::new(BooleanBuffer::new(
Buffer::from_iter(is_valid),
0,
indices.len(),
))
});

if array.fields().is_empty() {
return Ok(StructArray::new_empty_fields(indices.len(), nulls));
}

let arrays = array
.columns()
.iter()
.map(|array| {
let array = match array.data_type() {
arrow::datatypes::DataType::Struct(_) => {
Arc::new(take_struct_array(array.as_struct(), indices)?)
}
_ => arrow_select::take::take(array, indices, None)?,
};
Ok(array)
})
.collect::<Result<Vec<_>>>()?;
Ok(StructArray::new(array.fields().clone(), arrays, nulls))
}

#[cfg(test)]
mod test {
use arrow_array::{Int32Array, RecordBatchIterator, StringArray};
Expand Down

0 comments on commit eb16635

Please sign in to comment.