Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Array::shrink_to_fit(&mut self) #6790

Merged
merged 13 commits into from
Nov 29, 2024
7 changes: 7 additions & 0 deletions arrow-array/src/array/boolean_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,13 @@ impl Array for BooleanArray {
self.values.is_empty()
}

fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
self.values.offset()
}
Expand Down
8 changes: 8 additions & 0 deletions arrow-array/src/array/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,14 @@ impl<T: ByteArrayType> Array for GenericByteArray<T> {
self.value_offsets.len() <= 1
}

fn shrink_to_fit(&mut self) {
self.value_offsets.shrink_to_fit();
self.value_data.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
43 changes: 26 additions & 17 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,31 +430,31 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
///
/// Before GC:
/// ```text
/// ┌──────┐
/// │......│
/// │......│
/// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
/// ┌──────┐
/// │......│
/// │......│
/// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
/// │ View 1 │─ ─ ─ ─ │......│ with data that
/// ├────────────────────┤ │......│ is not referred
/// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
/// └────────────────────┘ │......│ View 2
/// │......│
/// 2 views, refer to │......│
/// small portions of a └──────┘
/// large buffer
/// └────────────────────┘ │......│ View 2
/// │......│
/// 2 views, refer to │......│
/// small portions of a └──────┘
/// large buffer
/// ```
///
///
/// After GC:
///
/// ```text
/// ┌────────────────────┐ ┌─────┐ After gc, only
/// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
/// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
/// │ View 2 │─ ─ ─ ─ └─────┘ the views is
/// └────────────────────┘ left
///
///
/// 2 views
/// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
/// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
/// │ View 2 │─ ─ ─ ─ └─────┘ the views is
/// └────────────────────┘ left
///
///
/// 2 views
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ooops… my editor is set to trim trailing whitspace on save. Let me know if you want me to reverse.

/// ```
/// This method will compact the data buffers by recreating the view array and only include the data
/// that is pointed to by the views.
Expand Down Expand Up @@ -575,6 +575,15 @@ impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
self.views.is_empty()
}

fn shrink_to_fit(&mut self) {
self.views.shrink_to_fit();
self.buffers.iter_mut().for_each(|b| b.shrink_to_fit());
self.buffers.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
5 changes: 5 additions & 0 deletions arrow-array/src/array/dictionary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,11 @@ impl<T: ArrowDictionaryKeyType> Array for DictionaryArray<T> {
self.keys.is_empty()
}

fn shrink_to_fit(&mut self) {
self.keys.shrink_to_fit();
self.values.shrink_to_fit();
}

fn offset(&self) -> usize {
self.keys.offset()
}
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/array/fixed_size_binary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,13 @@ impl Array for FixedSizeBinaryArray {
self.len == 0
}

fn shrink_to_fit(&mut self) {
self.value_data.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/array/fixed_size_list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,13 @@ impl Array for FixedSizeListArray {
self.len == 0
}

fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
8 changes: 8 additions & 0 deletions arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,14 @@ impl<OffsetSize: OffsetSizeTrait> Array for GenericListArray<OffsetSize> {
self.value_offsets.len() <= 1
}

fn shrink_to_fit(&mut self) {
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
self.values.shrink_to_fit();
self.value_offsets.shrink_to_fit();
}

fn offset(&self) -> usize {
0
}
Expand Down
9 changes: 9 additions & 0 deletions arrow-array/src/array/list_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,15 @@ impl<OffsetSize: OffsetSizeTrait> Array for GenericListViewArray<OffsetSize> {
self.value_sizes.is_empty()
}

fn shrink_to_fit(&mut self) {
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
self.values.shrink_to_fit();
self.value_offsets.shrink_to_fit();
self.value_sizes.shrink_to_fit();
}

fn offset(&self) -> usize {
0
}
Expand Down
8 changes: 8 additions & 0 deletions arrow-array/src/array/map_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,14 @@ impl Array for MapArray {
self.value_offsets.len() <= 1
}

fn shrink_to_fit(&mut self) {
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
self.entries.shrink_to_fit();
self.value_offsets.shrink_to_fit();
}

fn offset(&self) -> usize {
0
}
Expand Down
15 changes: 15 additions & 0 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,12 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// ```
fn is_empty(&self) -> bool;

/// Shrinks the capacity of any exclusively owned buffer as much as possible
///
/// Shared or externally allocated buffers will be ignored, and
/// any buffer offsets will be preserved.
fn shrink_to_fit(&mut self) {}

/// Returns the offset into the underlying data used by this array(-slice).
/// Note that the underlying data can be shared by many arrays.
/// This defaults to `0`.
Expand Down Expand Up @@ -365,6 +371,15 @@ impl Array for ArrayRef {
self.as_ref().is_empty()
}

/// For shared buffers, this is a no-op.
fn shrink_to_fit(&mut self) {
if let Some(slf) = Arc::get_mut(self) {
slf.shrink_to_fit();
} else {
// We ignore shared buffers.
}
}

fn offset(&self) -> usize {
self.as_ref().offset()
}
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/array/primitive_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,13 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
self.values.is_empty()
}

fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
}

fn offset(&self) -> usize {
0
}
Expand Down
5 changes: 5 additions & 0 deletions arrow-array/src/array/run_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,11 @@ impl<T: RunEndIndexType> Array for RunArray<T> {
self.run_ends.is_empty()
}

fn shrink_to_fit(&mut self) {
self.run_ends.shrink_to_fit();
self.values.shrink_to_fit();
}

fn offset(&self) -> usize {
self.run_ends.offset()
}
Expand Down
7 changes: 7 additions & 0 deletions arrow-array/src/array/struct_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,13 @@ impl Array for StructArray {
self.len == 0
}

fn shrink_to_fit(&mut self) {
if let Some(nulls) = &mut self.nulls {
nulls.shrink_to_fit();
}
self.fields.iter_mut().for_each(|n| n.shrink_to_fit());
}

fn offset(&self) -> usize {
0
}
Expand Down
11 changes: 11 additions & 0 deletions arrow-array/src/array/union_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,17 @@ impl Array for UnionArray {
self.type_ids.is_empty()
}

fn shrink_to_fit(&mut self) {
self.type_ids.shrink_to_fit();
if let Some(offsets) = &mut self.offsets {
offsets.shrink_to_fit();
}
for array in self.fields.iter_mut().flatten() {
array.shrink_to_fit();
}
self.fields.shrink_to_fit();
}

fn offset(&self) -> usize {
0
}
Expand Down
6 changes: 6 additions & 0 deletions arrow-buffer/src/buffer/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ impl BooleanBuffer {
self.len == 0
}

/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
// TODO(emilk): we could shrink even more in the case where we are a small sub-slice of the full buffer
self.buffer.shrink_to_fit();
}

/// Returns the boolean value at index `i`.
///
/// # Panics
Expand Down
63 changes: 63 additions & 0 deletions arrow-buffer/src/buffer/immutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,41 @@ impl Buffer {
self.data.capacity()
}

/// Tried to shrink the capacity of the buffer as much as possible, freeing unused memory.
///
/// If the buffer is shared, this is a no-op.
///
/// If the memory was allocated with a custom allocator, this is a no-op.
///
/// If the capacity is already less than or equal to the desired capacity, this is a no-op.
///
/// The memory region will be reallocated using `std::alloc::realloc`.
pub fn shrink_to_fit(&mut self) {
let offset = self.ptr_offset();
let is_empty = self.is_empty();
let desired_capacity = if is_empty {
0
} else {
// For realloc to work, we cannot free the elements before the offset
offset + self.len()
};
if desired_capacity < self.capacity() {
if let Some(bytes) = Arc::get_mut(&mut self.data) {
if bytes.try_realloc(desired_capacity).is_ok() {
// Realloc complete - update our pointer into `bytes`:
self.ptr = if is_empty {
bytes.as_ptr()
} else {
// SAFETY: we kept all elements leading up to the offset
unsafe { bytes.as_ptr().add(offset) }
}
} else {
// Failure to reallocate is fine; we just failed to free up memory.
}
}
}
}

/// Returns whether the buffer is empty.
#[inline]
pub fn is_empty(&self) -> bool {
Expand Down Expand Up @@ -554,6 +589,34 @@ mod tests {
assert_eq!(buf2.slice_with_length(2, 1).as_slice(), &[10]);
}

#[test]
fn test_shrink_to_fit() {
let original = Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7]);
assert_eq!(original.as_slice(), &[0, 1, 2, 3, 4, 5, 6, 7]);
assert_eq!(original.capacity(), 64);

let slice = original.slice_with_length(2, 3);
drop(original); // Make sure the buffer isn't shared (or shrink_to_fit won't work)
assert_eq!(slice.as_slice(), &[2, 3, 4]);
assert_eq!(slice.capacity(), 64);

let mut shrunk = slice;
shrunk.shrink_to_fit();
assert_eq!(shrunk.as_slice(), &[2, 3, 4]);
assert_eq!(shrunk.capacity(), 5); // shrink_to_fit is allowed to keep the elements before the offset

// Test that we can handle empty slices:
let empty_slice = shrunk.slice_with_length(1, 0);
drop(shrunk); // Make sure the buffer isn't shared (or shrink_to_fit won't work)
assert_eq!(empty_slice.as_slice(), &[]);
assert_eq!(empty_slice.capacity(), 5);

let mut shrunk_empty = empty_slice;
shrunk_empty.shrink_to_fit();
assert_eq!(shrunk_empty.as_slice(), &[]);
assert_eq!(shrunk_empty.capacity(), 1); // NOTE: `Buffer` and `Bytes` doesn't support 0-capacity
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They should do, but IIRC you need to use a dangling ptr, there should be some examples of this...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For MutableBuffer there is special handling for the size=0 case, with a dangling_ptr helper. We could copy all that logic to Bytes, but I rather not add all that complexity in this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in #6817

}

#[test]
#[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")]
fn test_slice_offset_out_of_bound() {
Expand Down
5 changes: 5 additions & 0 deletions arrow-buffer/src/buffer/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ impl NullBuffer {
self.buffer.is_empty()
}

/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
self.buffer.shrink_to_fit();
}

/// Returns the null count for this [`NullBuffer`]
#[inline]
pub fn null_count(&self) -> usize {
Expand Down
5 changes: 5 additions & 0 deletions arrow-buffer/src/buffer/offset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ impl<O: ArrowNativeType> OffsetBuffer<O> {
Self(out.into())
}

/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
self.0.shrink_to_fit();
}

/// Returns the inner [`ScalarBuffer`]
pub fn inner(&self) -> &ScalarBuffer<O> {
&self.0
Expand Down
6 changes: 6 additions & 0 deletions arrow-buffer/src/buffer/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ where
self.len == 0
}

/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
// TODO(emilk): we could shrink even more in the case where we are a small sub-slice of the full buffer
self.run_ends.shrink_to_fit();
}

/// Returns the values of this [`RunEndBuffer`] not including any offset
#[inline]
pub fn values(&self) -> &[E] {
Expand Down
Loading
Loading