Skip to content

Commit

Permalink
fix: Properly handle from_physical for List/Array (#20311)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Dec 16, 2024
1 parent ad38583 commit 9521d97
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 26 deletions.
45 changes: 45 additions & 0 deletions crates/polars-core/src/chunked_array/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,51 @@ impl ArrayChunked {
fld.coerce(DataType::Array(Box::new(inner_dtype), width))
}

/// Convert a non-logical [`ArrayChunked`] back into a logical [`ArrayChunked`] without casting.
///
/// # Safety
///
/// This can lead to invalid memory access in downstream code.
pub unsafe fn from_physical_unchecked(&self, to_inner_dtype: DataType) -> PolarsResult<Self> {
debug_assert!(!self.inner_dtype().is_logical());

let chunks = self
.downcast_iter()
.map(|chunk| chunk.values())
.cloned()
.collect();

let inner = unsafe {
Series::from_chunks_and_dtype_unchecked(PlSmallStr::EMPTY, chunks, self.inner_dtype())
};
let inner = unsafe { inner.from_physical_unchecked(&to_inner_dtype) }?;

let chunks: Vec<_> = self
.downcast_iter()
.zip(inner.into_chunks())
.map(|(chunk, values)| {
FixedSizeListArray::new(
ArrowDataType::FixedSizeList(
Box::new(ArrowField::new(
PlSmallStr::from_static("item"),
values.dtype().clone(),
true,
)),
self.width(),
),
chunk.len(),
values,
chunk.validity().cloned(),
)
.to_boxed()
})
.collect();

let name = self.name().clone();
let dtype = DataType::Array(Box::new(to_inner_dtype), self.width());
Ok(unsafe { Self::from_chunks_and_dtype_unchecked(name, chunks, dtype) })
}

/// Get the inner values as `Series`
pub fn get_inner(&self) -> Series {
let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect();
Expand Down
49 changes: 49 additions & 0 deletions crates/polars-core/src/chunked_array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,55 @@ impl ListChunked {
fld.coerce(DataType::List(Box::new(inner_dtype)))
}

/// Convert a non-logical [`ListChunked`] back into a logical [`ListChunked`] without casting.
///
/// # Safety
///
/// This can lead to invalid memory access in downstream code.
pub unsafe fn from_physical_unchecked(
&self,
to_inner_dtype: DataType,
) -> PolarsResult<ListChunked> {
debug_assert!(!self.inner_dtype().is_logical());

let inner_chunks = self
.downcast_iter()
.map(|chunk| chunk.values())
.cloned()
.collect();

let inner = unsafe {
Series::from_chunks_and_dtype_unchecked(
PlSmallStr::EMPTY,
inner_chunks,
self.inner_dtype(),
)
};
let inner = unsafe { inner.from_physical_unchecked(&to_inner_dtype) }?;

let chunks: Vec<_> = self
.downcast_iter()
.zip(inner.into_chunks())
.map(|(chunk, values)| {
LargeListArray::new(
ArrowDataType::LargeList(Box::new(ArrowField::new(
PlSmallStr::from_static("item"),
values.dtype().clone(),
true,
))),
chunk.offsets().clone(),
values,
chunk.validity().cloned(),
)
.to_boxed()
})
.collect();

let name = self.name().clone();
let dtype = DataType::List(Box::new(to_inner_dtype));
Ok(unsafe { ListChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) })
}

/// Get the inner values as [`Series`], ignoring the list offsets.
pub fn get_inner(&self) -> Series {
let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect();
Expand Down
28 changes: 28 additions & 0 deletions crates/polars-core/src/chunked_array/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,34 @@ impl StructChunked {
constructor(name, length, new_fields.iter())
}

/// Convert a non-logical [`StructChunked`] back into a logical [`StructChunked`] without casting.
///
/// # Safety
///
/// This can lead to invalid memory access in downstream code.
pub unsafe fn from_physical_unchecked(
&self,
to_fields: &[Field],
) -> PolarsResult<StructChunked> {
if cfg!(debug_assertions) {
for f in self.struct_fields() {
assert!(!f.dtype().is_logical());
}
}

let length = self.len();
let fields = self
.fields_as_series()
.iter()
.zip(to_fields)
.map(|(f, to)| unsafe { f.from_physical_unchecked(to.dtype()) })
.collect::<PolarsResult<Vec<_>>>()?;

let mut out = StructChunked::from_series(self.name().clone(), length, fields.iter())?;
out.zip_outer_validity(self);
Ok(out)
}

pub fn struct_fields(&self) -> &[Field] {
let DataType::Struct(fields) = self.dtype() else {
unreachable!()
Expand Down
63 changes: 37 additions & 26 deletions crates/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -547,34 +547,25 @@ impl Series {
},
(D::Int64, D::Time) => feature_gated!("dtype-time", Ok(self.clone().into_time())),

(D::List(_), D::List(to)) => Ok(self
.list()
.unwrap()
.apply_to_inner(&|inner| unsafe { inner.from_physical_unchecked(to) })?
.into_series()),
(D::List(_), D::List(to)) => unsafe {
self.list()
.unwrap()
.from_physical_unchecked(to.as_ref().clone())
.map(|ca| ca.into_series())
},
#[cfg(feature = "dtype-array")]
(D::Array(_, lw), D::Array(to, rw)) if lw == rw => Ok(self
.array()
.unwrap()
.apply_to_inner(&|inner| unsafe { inner.from_physical_unchecked(to) })?
.into_series()),
(D::Array(_, lw), D::Array(to, rw)) if lw == rw => unsafe {
self.array()
.unwrap()
.from_physical_unchecked(to.as_ref().clone())
.map(|ca| ca.into_series())
},
#[cfg(feature = "dtype-struct")]
(D::Struct(_), D::Struct(to)) => {
let slf = self.struct_().unwrap();

let length = slf.len();

let fields = slf
.fields_as_series()
.iter()
.zip(to)
.map(|(f, to)| unsafe { f.from_physical_unchecked(to.dtype()) })
.collect::<PolarsResult<Vec<_>>>()?;

let mut out =
StructChunked::from_series(slf.name().clone(), length, fields.iter())?;
out.zip_outer_validity(slf);
Ok(out.into_series())
(D::Struct(_), D::Struct(to)) => unsafe {
self.struct_()
.unwrap()
.from_physical_unchecked(to.as_slice())
.map(|ca| ca.into_series())
},

_ => panic!("invalid from_physical({dtype:?}) for {:?}", self.dtype()),
Expand Down Expand Up @@ -1203,6 +1194,26 @@ mod test {
let _ = ca.into_series();
}

#[test]
#[cfg(feature = "dtype-date")]
fn roundtrip_list_logical_20311() {
let list = ListChunked::from_chunk_iter(
PlSmallStr::from_static("a"),
[ListArray::new(
ArrowDataType::LargeList(Box::new(ArrowField::new(
PlSmallStr::from_static("item"),
ArrowDataType::Int32,
true,
))),
unsafe { Offsets::new_unchecked(vec![0, 1]) }.into(),
PrimitiveArray::new(ArrowDataType::Int32, vec![1i32].into(), None).to_boxed(),
None,
)],
);
let list = unsafe { list.from_physical_unchecked(DataType::Date) }.unwrap();
assert_eq!(list.dtype(), &DataType::List(Box::new(DataType::Date)));
}

#[test]
#[cfg(feature = "dtype-struct")]
fn new_series_from_empty_structs() {
Expand Down

0 comments on commit 9521d97

Please sign in to comment.