-
Notifications
You must be signed in to change notification settings - Fork 174
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Support intersect all and except distinct/all in DataFrame API #3537
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
use std::{iter::repeat, sync::Arc}; | ||
|
||
use arrow2::offset::OffsetsBuffer; | ||
use arrow2::offset::{Offsets, OffsetsBuffer}; | ||
use common_error::DaftResult; | ||
use indexmap::{ | ||
map::{raw_entry_v1::RawEntryMut, RawEntryApiV1}, | ||
|
@@ -255,6 +255,31 @@ fn list_sort_helper_fixed_size( | |
.collect() | ||
} | ||
|
||
fn general_list_fill_helper(element: &Series, num_array: &Int64Array) -> DaftResult<Vec<Series>> { | ||
let num_iter = create_iter(num_array, element.len()); | ||
let mut result = vec![]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we preallocate the capacity here? let mut result = Vec::with_capacity(...) |
||
let element_data = element.as_physical()?; | ||
for (row_index, num) in num_iter.enumerate() { | ||
let list_arr = if element.is_valid(row_index) { | ||
let mut list_growable = make_growable( | ||
element.name(), | ||
element.data_type(), | ||
vec![&element_data], | ||
false, | ||
num as usize, | ||
); | ||
for _ in 0..num { | ||
list_growable.extend(0, row_index, 1); | ||
} | ||
list_growable.build()? | ||
} else { | ||
Series::full_null(element.name(), element.data_type(), num as usize) | ||
}; | ||
result.push(list_arr); | ||
} | ||
Ok(result) | ||
} | ||
|
||
impl ListArray { | ||
pub fn value_counts(&self) -> DaftResult<MapArray> { | ||
struct IndexRef { | ||
|
@@ -625,6 +650,25 @@ impl ListArray { | |
self.validity().cloned(), | ||
)) | ||
} | ||
|
||
pub fn list_fill(elem: &Series, num_array: &Int64Array) -> DaftResult<Self> { | ||
let generated = general_list_fill_helper(elem, num_array)?; | ||
let generated_refs: Vec<&Series> = generated.iter().collect(); | ||
let lengths = generated.iter().map(|arr| arr.len()); | ||
let offsets = Offsets::try_from_lengths(lengths)?; | ||
let flat_child = if generated_refs.is_empty() { | ||
// when there's no output, we should create an empty series | ||
Series::empty(elem.name(), elem.data_type()) | ||
} else { | ||
Series::concat(&generated_refs)? | ||
}; | ||
Ok(Self::new( | ||
elem.field().to_list_field()?, | ||
flat_child, | ||
offsets.into(), | ||
None, | ||
)) | ||
} | ||
} | ||
|
||
impl FixedSizeListArray { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we add a couple of tests for `list_fill` in this file? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated, PTAL. |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,63 @@ | ||||||
use common_error::{DaftError, DaftResult}; | ||||||
use daft_core::{ | ||||||
datatypes::{DataType, Field}, | ||||||
prelude::{Schema, Series}, | ||||||
}; | ||||||
use daft_dsl::{ | ||||||
functions::{ScalarFunction, ScalarUDF}, | ||||||
ExprRef, | ||||||
}; | ||||||
use serde::{Deserialize, Serialize}; | ||||||
|
||||||
/// Field-less marker type for the `fill` scalar function.
///
/// It carries no configuration; the behavior lives in its `ScalarUDF`
/// implementation, which takes two inputs (a count and an element) and
/// delegates evaluation to `Series::list_fill`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct ListFill {}
|
||||||
#[typetag::serde] | ||||||
impl ScalarUDF for ListFill { | ||||||
fn as_any(&self) -> &dyn std::any::Any { | ||||||
self | ||||||
} | ||||||
|
||||||
fn name(&self) -> &'static str { | ||||||
"fill" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
|
||||||
} | ||||||
|
||||||
fn to_field(&self, inputs: &[ExprRef], schema: &Schema) -> DaftResult<Field> { | ||||||
match inputs { | ||||||
[n, elem] => { | ||||||
let num_field = n.to_field(schema)?; | ||||||
let elem_field = elem.to_field(schema)?; | ||||||
if !num_field.dtype.is_integer() { | ||||||
return Err(DaftError::TypeError(format!( | ||||||
"Expected num field to be of numeric type, received: {}", | ||||||
num_field.dtype | ||||||
))); | ||||||
} | ||||||
elem_field.to_list_field() | ||||||
} | ||||||
_ => Err(DaftError::SchemaMismatch(format!( | ||||||
"Expected 2 input args, got {}", | ||||||
inputs.len() | ||||||
))), | ||||||
} | ||||||
} | ||||||
|
||||||
fn evaluate(&self, inputs: &[Series]) -> DaftResult<Series> { | ||||||
match inputs { | ||||||
[num, elem] => { | ||||||
let num = num.cast(&DataType::Int64)?; | ||||||
let num_array = num.i64()?; | ||||||
elem.list_fill(num_array) | ||||||
} | ||||||
_ => Err(DaftError::ValueError(format!( | ||||||
"Expected 2 input args, got {}", | ||||||
inputs.len() | ||||||
))), | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
#[must_use] | ||||||
pub fn list_fill(n: ExprRef, elem: ExprRef) -> ExprRef { | ||||||
ScalarFunction::new(ListFill {}, vec![n, elem]).into() | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If there's a better name for this, suggestions would be really appreciated.