From c096172b769da8003ea7816086df60f38229a891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Wed, 31 Jan 2024 11:35:57 +0100 Subject: [PATCH] Track the memory usage of custom allocations so that their size can be reported via Array::get_buffer_memory_size (#5347) --- arrow-array/src/array/mod.rs | 13 +++++++++++++ arrow-buffer/src/alloc/mod.rs | 21 ++++++++++++++++++--- arrow-buffer/src/buffer/immutable.rs | 2 +- arrow-buffer/src/buffer/scalar.rs | 2 +- arrow-buffer/src/bytes.rs | 13 +++++++------ 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index f19406c1610b..1a58598543f8 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -272,6 +272,8 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the total number of bytes of memory pointed to by this array. /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. + /// Note that this does not always correspond to the exact memory usage of an array, + /// since multiple arrays can share the same buffers or slices thereof. fn get_buffer_memory_size(&self) -> usize; /// Returns the total number of bytes of memory occupied physically by this array. @@ -934,6 +936,17 @@ mod tests { ); } + #[test] + fn test_memory_size_primitive_sliced() { + let arr = PrimitiveArray::::from_iter_values(0..128); + let slice1 = arr.slice(0, 64); + let slice2 = arr.slice(64, 64); + + // both slices report the full buffer memory usage, even though the buffers are shared + assert_eq!(slice1.get_array_memory_size(), arr.get_array_memory_size()); + assert_eq!(slice2.get_array_memory_size(), arr.get_array_memory_size()); + } + #[test] fn test_memory_size_primitive_nullable() { let arr: PrimitiveArray = (0..128) diff --git a/arrow-buffer/src/alloc/mod.rs b/arrow-buffer/src/alloc/mod.rs index a3cb6253f324..d7108d2969bb 100644 --- a/arrow-buffer/src/alloc/mod.rs +++ b/arrow-buffer/src/alloc/mod.rs @@ -38,7 +38,9 @@ pub(crate) enum Deallocation { Standard(Layout), /// An allocation from an external source like the FFI interface /// Deallocation will happen on `Allocation::drop` - Custom(Arc), + /// The size of the allocation is tracked here separately only + /// for memory usage reporting via `Array::get_buffer_memory_size` + Custom(Arc, usize), } impl Debug for Deallocation { @@ -47,9 +49,22 @@ impl Debug for Deallocation { Deallocation::Standard(layout) => { write!(f, "Deallocation::Standard {layout:?}") } - Deallocation::Custom(_) => { - write!(f, "Deallocation::Custom {{ capacity: unknown }}") + Deallocation::Custom(_, size) => { + write!(f, "Deallocation::Custom {{ capacity: {size} }}") } } } } + +#[cfg(test)] +mod tests { + use crate::alloc::Deallocation; + + #[test] + fn test_size_of_deallocation() { + assert_eq!( + std::mem::size_of::(), + 3 * std::mem::size_of::() + ); + } +} diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 9db8732f3611..f7fc7cffdc73 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -124,7 +124,7 @@ impl Buffer { len: usize, owner: Arc, ) -> Self { - Buffer::build_with_arguments(ptr, len, Deallocation::Custom(owner)) + Buffer::build_with_arguments(ptr, len, Deallocation::Custom(owner, len)) } /// Auxiliary method to create a new Buffer diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index f1c2ae785720..3826d74e43bd 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -134,7 +134,7 @@ impl From for ScalarBuffer { is_aligned, "Memory pointer is not aligned with the specified scalar type" ), - Deallocation::Custom(_) => + Deallocation::Custom(_, _) => assert!(is_aligned, "Memory pointer from external source (e.g, FFI) is not aligned with the specified scalar type. Before importing buffer through FFI, please make sure the allocation is aligned."), } diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index 81860b604868..ba61342d8e39 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -90,9 +90,9 @@ impl Bytes { pub fn capacity(&self) -> usize { match self.deallocation { Deallocation::Standard(layout) => layout.size(), - // we cannot determine this in general, - // and thus we state that this is externally-owned memory - Deallocation::Custom(_) => 0, + // we only know the size of the custom allocation + // its underlying capacity might be larger + Deallocation::Custom(_, size) => size, } } @@ -116,7 +116,7 @@ impl Drop for Bytes { _ => unsafe { std::alloc::dealloc(self.ptr.as_ptr(), *layout) }, }, // The automatic drop implementation will free the memory once the reference count reaches zero - Deallocation::Custom(_allocation) => (), + Deallocation::Custom(_allocation, _size) => (), } } } @@ -147,10 +147,11 @@ impl Debug for Bytes { impl From for Bytes { fn from(value: bytes::Bytes) -> Self { + let len = value.len(); Self { - len: value.len(), + len, ptr: NonNull::new(value.as_ptr() as _).unwrap(), - deallocation: Deallocation::Custom(std::sync::Arc::new(value)), + deallocation: Deallocation::Custom(std::sync::Arc::new(value), len), } } }