Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into altair
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Aug 26, 2024
2 parents ea018b5 + 6f5851d commit 40a0e31
Show file tree
Hide file tree
Showing 264 changed files with 7,364 additions and 3,704 deletions.
311 changes: 201 additions & 110 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ improvements point to the `main` branch of this repo.
polars = { git = "https://github.com/pola-rs/polars", rev = "<optional git tag>" }
```

Requires Rust version `>=1.79`.
Requires Rust version `>=1.80`.

## Contributing

Expand Down
5 changes: 2 additions & 3 deletions crates/polars-arrow/src/array/binary/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -442,9 +442,8 @@ impl<O: Offset, T: AsRef<[u8]>> TryPush<Option<T>> for MutableBinaryArray<O> {
Some(value) => {
self.values.try_push(value.as_ref())?;

match &mut self.validity {
Some(validity) => validity.push(true),
None => {},
if let Some(validity) = &mut self.validity {
validity.push(true)
}
},
None => {
Expand Down
173 changes: 173 additions & 0 deletions crates/polars-arrow/src/array/binview/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,128 @@ impl MutableBinaryViewArray<[u8]> {
}
Ok(())
}

/// Extend from a `buffer` and `length` of items given some statistics about the lengths.
///
/// This will attempt to dispatch to several optimized implementations.
///
/// # Safety
///
/// This is safe if the statistics are correct.
pub unsafe fn extend_from_lengths_with_stats(
&mut self,
buffer: &[u8],
lengths_iterator: impl Clone + ExactSizeIterator<Item = usize>,
min_length: usize,
max_length: usize,
sum_length: usize,
) {
let num_items = lengths_iterator.len();

if num_items == 0 {
return;
}

#[cfg(debug_assertions)]
{
let (min, max, sum) = lengths_iterator.clone().map(|v| (v, v, v)).fold(
(usize::MAX, usize::MIN, 0usize),
|(cmin, cmax, csum), (emin, emax, esum)| {
(cmin.min(emin), cmax.max(emax), csum + esum)
},
);

assert_eq!(min, min_length);
assert_eq!(max, max_length);
assert_eq!(sum, sum_length);
}

assert!(sum_length <= buffer.len());

let mut buffer_offset = 0;
if min_length > View::MAX_INLINE_SIZE as usize
&& (num_items == 1 || sum_length + self.in_progress_buffer.len() <= u32::MAX as usize)
{
let buffer_idx = self.completed_buffers().len() as u32;
let in_progress_buffer_offset = self.in_progress_buffer.len();

self.in_progress_buffer
.extend_from_slice(&buffer[..sum_length]);
self.views.extend(lengths_iterator.map(|length| {
// SAFETY: We asserted before that the sum of all lengths is smaller or equal to
// the buffer length.
let view_buffer =
unsafe { buffer.get_unchecked(buffer_offset..buffer_offset + length) };

// SAFETY: We know that the minimum length > View::MAX_INLINE_SIZE. Therefore, this
// length is > View::MAX_INLINE_SIZE.
let view = unsafe {
View::new_noninline_unchecked(
view_buffer,
buffer_idx,
(buffer_offset + in_progress_buffer_offset) as u32,
)
};
buffer_offset += length;
view
}));
} else if max_length <= View::MAX_INLINE_SIZE as usize {
// If the min and max are the same, we can dispatch to the optimized SIMD
// implementation.
if min_length == max_length {
let length = min_length;
if length == 0 {
self.views
.resize(self.views.len() + num_items, View::new_inline(&[]));
} else {
View::extend_with_inlinable_strided(
&mut self.views,
&buffer[..length * num_items],
length as u8,
);
}
} else {
self.views.extend(lengths_iterator.map(|length| {
// SAFETY: We asserted before that the sum of all lengths is smaller or equal
// to the buffer length.
let view_buffer =
unsafe { buffer.get_unchecked(buffer_offset..buffer_offset + length) };

// SAFETY: We know that each view has a length <= View::MAX_INLINE_SIZE because
// the maximum length is <= View::MAX_INLINE_SIZE
let view = unsafe { View::new_inline_unchecked(view_buffer) };
buffer_offset += length;
view
}));
}
} else {
// If all fails, just fall back to a base implementation.
self.reserve(num_items);
for length in lengths_iterator {
let value = &buffer[buffer_offset..buffer_offset + length];
buffer_offset += length;
self.push_value(value);
}
}
}

/// Extend from a `buffer` and `length` of items.
///
/// This will attempt to dispatch to several optimized implementations.
#[inline]
pub fn extend_from_lengths(
&mut self,
buffer: &[u8],
lengths_iterator: impl Clone + ExactSizeIterator<Item = usize>,
) {
let (min, max, sum) = lengths_iterator.clone().map(|v| (v, v, v)).fold(
(usize::MAX, 0usize, 0usize),
|(cmin, cmax, csum), (emin, emax, esum)| (cmin.min(emin), cmax.max(emax), csum + esum),
);

// SAFETY: We just collected the right stats.
unsafe { self.extend_from_lengths_with_stats(buffer, lengths_iterator, min, max, sum) }
}
}

impl<T: ViewType + ?Sized, P: AsRef<T>> Extend<Option<P>> for MutableBinaryViewArray<T> {
Expand Down Expand Up @@ -646,3 +768,54 @@ impl<T: ViewType + ?Sized, P: AsRef<T>> TryPush<Option<P>> for MutableBinaryView
Ok(())
}
}

#[cfg(test)]
mod tests {
use super::*;

fn roundtrip(values: &[&[u8]]) -> bool {
let buffer = values
.iter()
.flat_map(|v| v.iter().copied())
.collect::<Vec<u8>>();
let lengths = values.iter().map(|v| v.len()).collect::<Vec<usize>>();
let mut bv = MutableBinaryViewArray::<[u8]>::with_capacity(values.len());

bv.extend_from_lengths(&buffer[..], lengths.into_iter());

&bv.values_iter().collect::<Vec<&[u8]>>()[..] == values
}

#[test]
fn extend_with_lengths_basic() {
assert!(roundtrip(&[]));
assert!(roundtrip(&[b"abc"]));
assert!(roundtrip(&[
b"a_very_very_long_string_that_is_not_inlinable"
]));
assert!(roundtrip(&[
b"abc",
b"a_very_very_long_string_that_is_not_inlinable"
]));
}

#[test]
fn extend_with_inlinable_fastpath() {
assert!(roundtrip(&[b"abc", b"defg", b"hix"]));
assert!(roundtrip(&[b"abc", b"defg", b"hix", b"xyza1234abcd"]));
}

#[test]
fn extend_with_inlinable_eq_len_fastpath() {
assert!(roundtrip(&[b"abc", b"def", b"hix"]));
assert!(roundtrip(&[b"abc", b"def", b"hix", b"xyz"]));
}

#[test]
fn extend_with_not_inlinable_fastpath() {
assert!(roundtrip(&[
b"a_very_long_string123",
b"a_longer_string_than_the_previous"
]));
}
}
17 changes: 8 additions & 9 deletions crates/polars-arrow/src/array/binview/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,12 +157,12 @@ impl View {
/// Extend a `Vec<View>` with inline views slices of `src` with `width`.
///
/// This tries to use SIMD to optimize the copying and can be massively faster than doing a
/// `views.extend(src.chunks_exact(stride).map(View::new_inline))`.
/// `views.extend(src.chunks_exact(width).map(View::new_inline))`.
///
/// # Panics
///
/// This function panics if `src.len()` is not divisible by `width` or if `width >
/// View::MAX_INLINE_SIZE`.
/// This function panics if `src.len()` is not divisible by `width`, `width >
/// View::MAX_INLINE_SIZE` or `width == 0`.
pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {
macro_rules! dispatch {
($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {
Expand All @@ -180,17 +180,16 @@ impl View {
}

let width = width as usize;
assert_eq!(src.len() % width, 0);

assert!(width > 0);
assert!(width <= View::MAX_INLINE_SIZE as usize);

assert_eq!(src.len() % width, 0);

let num_values = src.len() / width;

views.reserve(num_values);

if width == 0 {
views.resize(views.len() + num_values, View::new_inline(&[]));
return;
}

#[allow(unused_mut)]
let mut src = src;

Expand Down
5 changes: 2 additions & 3 deletions crates/polars-arrow/src/array/boolean/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,8 @@ impl MutableBooleanArray {
#[inline]
pub fn push_value(&mut self, value: bool) {
self.values.push(value);
match &mut self.validity {
Some(validity) => validity.push(true),
None => {},
if let Some(validity) = &mut self.validity {
validity.push(true)
}
}

Expand Down
5 changes: 2 additions & 3 deletions crates/polars-arrow/src/array/fixed_size_binary/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,8 @@ impl MutableFixedSizeBinaryArray {
}
self.values.extend_from_slice(bytes);

match &mut self.validity {
Some(validity) => validity.push(true),
None => {},
if let Some(validity) = &mut self.validity {
validity.push(true)
}
},
None => {
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static {
dyn_clone::clone_trait_object!(Array);

/// A trait describing a mutable array; i.e. an array whose values can be changed.
///
/// Mutable arrays cannot be cloned but can be mutated in place,
/// thereby making them useful to perform numeric operations without allocations.
/// As in [`Array`], concrete arrays (such as [`MutablePrimitiveArray`]) implement how they are mutated.
Expand Down Expand Up @@ -370,6 +371,7 @@ pub fn new_empty_array(data_type: ArrowDataType) -> Box<dyn Array> {
}

/// Creates a new [`Array`] of [`ArrowDataType`] `data_type` and `length`.
///
/// The array is guaranteed to have [`Array::null_count`] equal to [`Array::len`]
/// for all types except Union, which does not have a validity.
pub fn new_null_array(data_type: ArrowDataType, length: usize) -> Box<dyn Array> {
Expand Down
5 changes: 2 additions & 3 deletions crates/polars-arrow/src/array/primitive/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,8 @@ impl<T: NativeType> MutablePrimitiveArray<T> {
#[inline]
pub fn push_value(&mut self, value: T) {
self.values.push(value);
match &mut self.validity {
Some(validity) => validity.push(true),
None => {},
if let Some(validity) = &mut self.validity {
validity.push(true)
}
}

Expand Down
5 changes: 2 additions & 3 deletions crates/polars-arrow/src/array/utf8/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -522,9 +522,8 @@ impl<O: Offset, T: AsRef<str>> TryPush<Option<T>> for MutableUtf8Array<O> {
Some(value) => {
self.values.try_push(value.as_ref())?;

match &mut self.validity {
Some(validity) => validity.push(true),
None => {},
if let Some(validity) = &mut self.validity {
validity.push(true)
}
},
None => {
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-arrow/src/bitmap/utils/slice_iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ enum State {
Finished,
}

/// Iterator over a bitmap that returns slices of set regions
/// Iterator over a bitmap that returns slices of set regions.
///
/// This is the most efficient method to extract slices of values from arrays
/// with a validity bitmap.
/// For example, the bitmap `00101111` returns `[(0,4), (6,1)]`
Expand Down
15 changes: 9 additions & 6 deletions crates/polars-arrow/src/compute/arity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ use crate::bitmap::{Bitmap, MutableBitmap};
use crate::datatypes::ArrowDataType;
use crate::types::NativeType;

/// Applies an unary and infallible function to a [`PrimitiveArray`]. This is the
/// fastest way to perform an operation on a [`PrimitiveArray`] when the benefits
/// of a vectorized operation outweighs the cost of branching nulls and
/// non-nulls.
/// Applies an unary and infallible function to a [`PrimitiveArray`].
///
/// This is the /// fastest way to perform an operation on a [`PrimitiveArray`] when the benefits
/// of a vectorized operation outweighs the cost of branching nulls and non-nulls.
///
/// # Implementation
/// This will apply the function for all values, including those on null slots.
Expand Down Expand Up @@ -131,11 +131,14 @@ where
PrimitiveArray::<O>::new(data_type, values, validity)
}

/// Applies a binary operations to two primitive arrays. This is the fastest
/// way to perform an operation on two primitive array when the benefits of a
/// Applies a binary operations to two primitive arrays.
///
/// This is the fastest way to perform an operation on two primitive array when the benefits of a
/// vectorized operation outweighs the cost of branching nulls and non-nulls.
///
/// # Errors
/// This function errors iff the arrays have a different length.
///
/// # Implementation
/// This will apply the function for all values, including those on null slots.
/// This implies that the operation must be infallible for any value of the
Expand Down
8 changes: 7 additions & 1 deletion crates/polars-arrow/src/compute/temporal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,33 +75,38 @@ macro_rules! date_like {
}

/// Extracts the years of a temporal array as [`PrimitiveArray<i32>`].
///
/// Use [`can_year`] to check if this operation is supported for the target [`ArrowDataType`].
pub fn year(array: &dyn Array) -> PolarsResult<PrimitiveArray<i32>> {
date_like!(year, array, ArrowDataType::Int32)
}

/// Extracts the months of a temporal array as [`PrimitiveArray<i8>`].
///
/// Value ranges from 1 to 12.
/// Use [`can_month`] to check if this operation is supported for the target [`ArrowDataType`].
pub fn month(array: &dyn Array) -> PolarsResult<PrimitiveArray<i8>> {
date_like!(month, array, ArrowDataType::Int8)
}

/// Extracts the days of a temporal array as [`PrimitiveArray<i8>`].
///
/// Value ranges from 1 to 32 (Last day depends on month).
/// Use [`can_day`] to check if this operation is supported for the target [`ArrowDataType`].
pub fn day(array: &dyn Array) -> PolarsResult<PrimitiveArray<i8>> {
date_like!(day, array, ArrowDataType::Int8)
}

/// Extracts weekday of a temporal array as [`PrimitiveArray<i8>`].
///
/// Monday is 1, Tuesday is 2, ..., Sunday is 7.
/// Use [`can_weekday`] to check if this operation is supported for the target [`ArrowDataType`]
pub fn weekday(array: &dyn Array) -> PolarsResult<PrimitiveArray<i8>> {
date_like!(i8_weekday, array, ArrowDataType::Int8)
}

/// Extracts ISO week of a temporal array as [`PrimitiveArray<i8>`]
/// Extracts ISO week of a temporal array as [`PrimitiveArray<i8>`].
///
/// Value ranges from 1 to 53 (Last week depends on the year).
/// Use [`can_iso_week`] to check if this operation is supported for the target [`ArrowDataType`]
pub fn iso_week(array: &dyn Array) -> PolarsResult<PrimitiveArray<i8>> {
Expand Down Expand Up @@ -161,6 +166,7 @@ pub fn second(array: &dyn Array) -> PolarsResult<PrimitiveArray<i8>> {
}

/// Extracts the nanoseconds of a temporal array as [`PrimitiveArray<i32>`].
///
/// Value ranges from 0 to 1_999_999_999.
/// The range from 1_000_000_000 to 1_999_999_999 represents the leap second.
/// Use [`can_nanosecond`] to check if this operation is supported for the target [`ArrowDataType`].
Expand Down
Loading

0 comments on commit 40a0e31

Please sign in to comment.