diff --git a/crates/sparrow-compiler/src/functions.rs b/crates/sparrow-compiler/src/functions.rs index 5aa988431..a1009692d 100644 --- a/crates/sparrow-compiler/src/functions.rs +++ b/crates/sparrow-compiler/src/functions.rs @@ -9,7 +9,6 @@ mod implementation; mod json; mod logical; mod math; -mod pushdown; mod registry; mod string; mod time; @@ -18,7 +17,6 @@ mod window; pub use function::*; use implementation::*; -pub(crate) use pushdown::*; pub use registry::*; pub use time_domain_check::*; diff --git a/crates/sparrow-compiler/src/functions/aggregation.rs b/crates/sparrow-compiler/src/functions/aggregation.rs index deca779dc..54771204f 100644 --- a/crates/sparrow-compiler/src/functions/aggregation.rs +++ b/crates/sparrow-compiler/src/functions/aggregation.rs @@ -1,7 +1,5 @@ -use anyhow::Context; - use crate::functions::time_domain_check::TimeDomainCheck; -use crate::functions::{Implementation, Pushdown, Registry}; +use crate::functions::{Implementation, Registry}; /// The `is_new` pattern used for basic aggregations. const AGGREGATION_IS_NEW: &str = "(logical_or ?window_is_new ?input_is_new)"; @@ -124,39 +122,11 @@ pub(super) fn register(registry: &mut Registry) { .with_dfg_signature( "last(input: T, window: bool = null, duration: i64 = null) -> T", ) - .with_implementation(Implementation::Pushdown(Box::new( - Pushdown::try_new( - 0, - &format!( - "(last ({}) ({}) ({}))", - "transform (if ?is_new ?input_value) (merge_join ?op ?window_op)", - "?window_value", - "?duration_value" - ), - // The per-field pattern produces the last value of the field. - // The outer if and last is handling the case where the latest *record* - // contained a null value for the field, by only using the last value of - // the field if the record is new and valid and the input field is valid in - // that record. 
- &format!( - "(if (last ({}) ({}) ({})) ?recurse_on_input_field)", - "transform (if (logical_and ?is_new (is_valid ?input_record)) (is_valid \ - ?input_field)) (merge_join ?op ?window_op)", - "?window_value", - "?duration_value" - ), - // The result pattern treats the resulting record as `null` if there haven't - // been any new non-null records observed. Eg., requires the count to be > 0. - &format!( - "(if (gt (count_if ({}) ({}) ({})) 0u32) ?result_record)", - "transform (logical_and ?is_new (is_valid ?input_record)) (merge_join ?op \ - ?window_op)", - "?window_value", - "?duration_value" - ), - ) - .context("last") - .unwrap(), + .with_implementation(Implementation::new_pattern(&format!( + "(last ({}) ({}) ({}))", + "transform (if ?input_is_new ?input_value) (merge_join ?input_op ?window_op)", + "?window_value", + "?duration_value" ))) .with_is_new(Implementation::new_pattern(AGGREGATION_IS_NEW)) .with_time_domain_check(TimeDomainCheck::Aggregation); @@ -166,39 +136,11 @@ pub(super) fn register(registry: &mut Registry) { .with_dfg_signature( "first(input: T, window: bool = null, duration: i64 = null) -> T", ) - .with_implementation(Implementation::Pushdown(Box::new( - Pushdown::try_new( - 0, - &format!( - "(first({}) ({}) ({}))", - "transform (if ?is_new ?input_value) (merge_join ?op ?window_op)", - "?window_value", - "?duration_value" - ), - // The per-field pattern produces the last value of the field. - // The outer if and last is handling the case where the latest *record* - // contained a null value for the field, by only using the last value of - // the field if the record is new and valid and the input field is valid in - // that record. 
- &format!( - "(if (first ({}) ({}) ({})) ?recurse_on_input_field)", - "transform (if (logical_and ?is_new (is_valid ?input_record)) (is_valid \ - ?input_field)) (merge_join ?op ?window_op)", - "?window_value", - "?duration_value" - ), - // The result pattern treats the resulting record as `null` if there haven't - // been any new non-null records observed. Eg., requires the count to be > 0. - &format!( - "(if (gt (count_if ({}) ({}) ({})) 0u32) ?result_record)", - "transform (logical_and ?is_new (is_valid ?input_record)) (merge_join ?op \ - ?window_op)", - "?window_value", - "?duration_value" - ), - ) - .context("first") - .unwrap(), + .with_implementation(Implementation::new_pattern(&format!( + "(first ({}) ({}) ({}))", + "transform (if ?input_is_new ?input_value) (merge_join ?input_op ?window_op)", + "?window_value", + "?duration_value" ))) .with_is_new(Implementation::new_pattern(AGGREGATION_IS_NEW)) .with_time_domain_check(TimeDomainCheck::Aggregation); diff --git a/crates/sparrow-compiler/src/functions/implementation.rs b/crates/sparrow-compiler/src/functions/implementation.rs index 012682fc7..04b79538d 100644 --- a/crates/sparrow-compiler/src/functions/implementation.rs +++ b/crates/sparrow-compiler/src/functions/implementation.rs @@ -1,18 +1,18 @@ use std::str::FromStr; use anyhow::{anyhow, Context}; -use egg::{Id, Var}; +use egg::Id; use itertools::izip; use once_cell::sync::OnceCell; use smallvec::smallvec; use sparrow_api::kaskada::v1alpha::operation_plan::tick_operation::TickBehavior; use sparrow_instructions::InstOp; -use sparrow_syntax::{Expr, FeatureSetPart, FenlType, Located, ResolvedExpr, WindowBehavior}; +use sparrow_syntax::{Expr, FeatureSetPart, Located, ResolvedExpr, WindowBehavior}; use crate::ast_to_dfg::ast_to_dfg; use crate::dfg::{Dfg, DfgPattern, Operation, StepKind}; use crate::frontend::resolve_arguments::resolve_recursive; -use crate::functions::{Function, Pushdown}; +use crate::functions::Function; use crate::{is_any_new, AstDfgRef, 
DataContext, DiagnosticCollector}; /// Enum describing how a function is implemented. @@ -26,9 +26,6 @@ pub(super) enum Implementation { Window(WindowBehavior), /// The function should be expanded using the given pattern. Pattern(DfgPattern), - /// The function should be expanded on primitive fields using the given - /// pushdown. - Pushdown(Box), /// The function should be rewritten as the given fenl expression. /// /// This differs from `Rewrite` in that this expression uses fenl syntax and @@ -140,37 +137,6 @@ impl Implementation { Ok(result.value()) } - Implementation::Pushdown(pushdown) => { - // To avoid accidents, we don't include the "driving" argument in the - // substitution. Specifically, the "input" to the pushdown will - // be changed at each recursion. - let mut subst = - function.create_subst_from_args(dfg, args, Some(pushdown.pushdown_on())); - - let pushdown_on = &args[pushdown.pushdown_on()]; - // Add an `is_new` that indicates whether the argument being pushed down on - // was new. We can't access the `is_new` of individual components. 
- subst.insert( - Var::from_str("?is_new").context("Failed to parse ?is_new")?, - pushdown_on.is_new(), - ); - subst.insert( - Var::from_str("?op").context("Failed to parse ?op")?, - dfg.operation(pushdown_on.value()), - ); - - match pushdown_on.value_type() { - FenlType::Concrete(data_type) => { - pushdown.pushdown(dfg, &subst, pushdown_on.value(), data_type) - } - FenlType::Error => Ok(dfg.error_node().value()), - non_concrete => Err(anyhow!( - "Unable to pushdown '{}' on non-concrete type {}", - function.name(), - non_concrete - )), - } - } Implementation::AnyInputIsNew => Ok(is_any_new(dfg, args)?), } } diff --git a/crates/sparrow-compiler/src/functions/pushdown.rs b/crates/sparrow-compiler/src/functions/pushdown.rs deleted file mode 100644 index 09b23fd24..000000000 --- a/crates/sparrow-compiler/src/functions/pushdown.rs +++ /dev/null @@ -1,183 +0,0 @@ -use std::str::FromStr; - -use anyhow::{anyhow, Context}; -use arrow::datatypes::{DataType, FieldRef}; -use egg::{Id, Subst, Var}; -use smallvec::smallvec; -use sparrow_instructions::InstKind; - -use crate::dfg::{ChildrenVec, Dfg, DfgPattern, Expression}; - -/// Implements pushdown of instructions to the fields of records. -/// -/// Used to implement operations such as `last` and `first` on records -/// in terms of `last` and `first` on the individual fields. -#[derive(Debug)] -pub(crate) struct Pushdown { - /// The argument that the pushdown should be performed on. - /// - /// This is used to find the type that the pushdown is performed on. - pushdown_on: usize, - /// The pattern to use on primitive types. - primitive_pattern: DfgPattern, - /// The pattern to use on each field of a record. - record_field_pattern: DfgPattern, - /// The pattern to use after assembling a record containing the pushed-down - /// fields. - record_result_pattern: DfgPattern, -} - -impl Pushdown { - /// Configure how an operation is pushed down to primitives. 
- /// - /// Configured with the parameter whose type should "drive" the pushdown and - /// the patterns to use for pushing the operation to a record (fields and - /// final result). - /// - /// The primitive pattern may reference `?input` to get the value of the - /// input. - /// - /// The `record_field_pattern` may reference `input_record` and - /// `input_field` to get the original input record and the field being - /// worked on, respectively. - /// - /// The `record_result_pattern` may reference `input_record` and - /// `result_record` to get the original input record and the newly - /// created pushed-down version. - /// - /// All arguments other than the argument used for `pushdown_on` may also be - /// referenced by name, as `?name`. - pub(super) fn try_new( - pushdown_on: usize, - primitive_pattern: &str, - record_field_pattern: &str, - record_result_pattern: &str, - ) -> anyhow::Result { - let primitive_pattern = - DfgPattern::from_str(primitive_pattern).context("primitive_pattern")?; - let record_field_pattern = - DfgPattern::from_str(record_field_pattern).context("record_field_pattern")?; - let recurse_on_input_field = - Var::from_str("?recurse_on_input_field").context("valid var")?; - - // Make sure we recursively expand things. If this isn't true, then the - // pattern only applies the rewrites one level deep. 
- anyhow::ensure!( - record_field_pattern.references(&recurse_on_input_field), - "Record field pattern must include ?recurse_on_input_field" - ); - - let record_result_pattern = - DfgPattern::from_str(record_result_pattern).context("record_result_pattern")?; - - Ok(Self { - pushdown_on, - primitive_pattern, - record_field_pattern, - record_result_pattern, - }) - } - - pub(super) fn pushdown_on(&self) -> usize { - self.pushdown_on - } - - pub(super) fn pushdown( - &self, - dfg: &mut Dfg, - subst: &Subst, - value: Id, - value_type: &DataType, - ) -> anyhow::Result { - match value_type { - DataType::Boolean - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - | DataType::Timestamp(_, _) - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Interval(_) - | DataType::Utf8 - | DataType::LargeUtf8 - | DataType::List(..) - | DataType::Map(..) 
=> { - let mut subst = subst.clone(); - subst.insert( - Var::from_str("?input_value").context("Failed to parse ?input_value")?, - value, - ); - dfg.add_pattern(&self.primitive_pattern, &subst) - } - DataType::Struct(fields) => { - let fields = fields.clone(); - self.pushdown_struct(dfg, subst, value, fields.as_ref()) - } - unsupported => Err(anyhow!("Pushdown operation on type {:?}", unsupported)), - } - } - - fn pushdown_struct( - &self, - dfg: &mut Dfg, - subst: &Subst, - record: Id, - fields: &[FieldRef], - ) -> anyhow::Result { - let mut args = ChildrenVec::with_capacity(fields.len() * 2); - - let mut field_subst = subst.clone(); - - let field_var = Var::from_str("?input_field").context("Failed to parse ?input_field")?; - let recursive_var = Var::from_str("?recurse_on_input_field") - .context("Failed to parse ?recurse_on_input_field")?; - field_subst.insert( - Var::from_str("?input_record").context("Failed to parse ?input_record")?, - record, - ); - - for field in fields { - // T.a - let field_name = dfg.add_string_literal(field.name())?; - - let field_ref = dfg.add_expression( - Expression::Inst(InstKind::FieldRef), - smallvec![record, field_name], - )?; - - field_subst.insert(field_var, field_ref); - - let recurse = self.pushdown(dfg, subst, field_ref, field.data_type())?; - field_subst.insert(recursive_var, recurse); - - args.push(field_name); - args.push(dfg.add_pattern(&self.record_field_pattern, &field_subst)?); - } - - // The result record. 
- let result_record = dfg.add_expression(Expression::Inst(InstKind::Record), args)?; - - let mut result_subst = subst.clone(); - result_subst.insert( - Var::from_str("?input_record").context("Failed to parse ?input_record")?, - record, - ); - result_subst.insert( - Var::from_str("?result_record").context("Failed to parse ?result_record")?, - result_record, - ); - - dfg.add_pattern(&self.record_result_pattern, &result_subst) - } -} diff --git a/crates/sparrow-compiler/tests/snapshots/compiler_golden_tests__projection_pushdown.snap b/crates/sparrow-compiler/tests/snapshots/compiler_golden_tests__projection_pushdown.snap index 329b4d0bf..dbdad9956 100644 --- a/crates/sparrow-compiler/tests/snapshots/compiler_golden_tests__projection_pushdown.snap +++ b/crates/sparrow-compiler/tests/snapshots/compiler_golden_tests__projection_pushdown.snap @@ -48,123 +48,63 @@ operations: - arguments: [] result_type: kind: - Primitive: 1 + Primitive: 14 output: false operator: Literal: - literal: ~ + literal: + Utf8: amount - arguments: [] result_type: kind: - Primitive: 14 + Primitive: 1 output: false operator: Literal: - literal: - Utf8: amount + literal: ~ - arguments: - 0 - 3 + - 3 result_type: kind: - Primitive: 13 - output: false - operator: - Instruction: field_ref - - arguments: - - 4 - - 2 - - 2 - result_type: - kind: - Primitive: 13 + Struct: + fields: + - name: sender + data_type: + kind: + Primitive: 10 + nullable: true + - name: amount + data_type: + kind: + Primitive: 13 + nullable: true + - name: receiver + data_type: + kind: + Primitive: 10 + nullable: true + - name: store + data_type: + kind: + Primitive: 10 + nullable: true output: false operator: Instruction: first - arguments: - 4 - result_type: - kind: - Primitive: 2 - output: false - operator: - Instruction: is_valid - - arguments: - - 0 - result_type: - kind: - Primitive: 2 - output: false - operator: - Instruction: is_valid - - arguments: - - 7 - - 6 - result_type: - kind: - Primitive: 2 - output: false - 
operator: - Instruction: if - - arguments: - - 8 - - 2 - 2 - result_type: - kind: - Primitive: 2 - output: false - operator: - Instruction: first - - arguments: - - 9 - - 5 result_type: kind: Primitive: 13 output: false operator: - Instruction: if - - arguments: [] - result_type: - kind: - Primitive: 9 - output: false - operator: - Literal: - literal: - Uint32: 0 + Instruction: field_ref - arguments: - - 7 - 2 - - 2 - result_type: - kind: - Primitive: 9 - output: false - operator: - Instruction: count_if - - arguments: - - 12 - - 11 - result_type: - kind: - Primitive: 2 - output: false - operator: - Instruction: gt - - arguments: - - 13 - - 10 - result_type: - kind: - Primitive: 13 - output: false - operator: - Instruction: if - - arguments: - - 3 - - 14 + - 5 result_type: kind: Struct: @@ -178,7 +118,7 @@ operations: operator: Instruction: record - arguments: - - 15 + - 6 result_type: kind: Primitive: 24 @@ -186,7 +126,7 @@ operations: operator: Instruction: time_of - arguments: - - 16 + - 7 - 1 result_type: kind: @@ -242,7 +182,7 @@ operations: input_column: 3 interpolation: 2 column: - ProducerExpression: 15 + ProducerExpression: 6 operator: Select: input: 0 @@ -251,7 +191,7 @@ operations: input_column: 4 interpolation: 1 column: - ProducerExpression: 17 + ProducerExpression: 8 primary_grouping: account primary_grouping_key_type: kind: diff --git a/crates/sparrow-instructions/src/evaluators.rs b/crates/sparrow-instructions/src/evaluators.rs index f5c84248a..8f58eca1b 100644 --- a/crates/sparrow-instructions/src/evaluators.rs +++ b/crates/sparrow-instructions/src/evaluators.rs @@ -220,19 +220,7 @@ fn create_simple_evaluator( InstOp::Exp => { create_float_evaluator!(&info.args[0].data_type, ExpEvaluator, info) } - InstOp::First => { - create_typed_evaluator!( - &info.args[0].data_type, - ArrowAggEvaluator, - UnsupportedEvaluator, - FirstListEvaluator, - FirstMapEvaluator, - FirstBooleanEvaluator, - FirstStringEvaluator, - FirstPrimitive, - info - ) - } + 
InstOp::First => aggregation::FirstEvaluator::try_new(info), InstOp::Flatten => FlattenEvaluator::try_new(info), InstOp::Floor => FloorEvaluator::try_new(info), InstOp::Get => GetEvaluator::try_new(info), @@ -272,19 +260,7 @@ fn create_simple_evaluator( // rely on simplification for conversion. InstOp::Json => anyhow::bail!("No evaluator defined for json function"), InstOp::JsonField => JsonFieldEvaluator::try_new(info), - InstOp::Last => { - create_typed_evaluator!( - &info.args[0].data_type, - ArrowAggEvaluator, - UnsupportedEvaluator, - LastListEvaluator, - LastMapEvaluator, - LastBooleanEvaluator, - LastStringEvaluator, - LastPrimitive, - info - ) - } + InstOp::Last => aggregation::LastEvaluator::try_new(info), InstOp::Len => LenEvaluator::try_new(info), InstOp::ListLen => ListLenEvaluator::try_new(info), InstOp::LogicalAnd => LogicalAndKleeneEvaluator::try_new(info), diff --git a/crates/sparrow-instructions/src/evaluators/aggregation.rs b/crates/sparrow-instructions/src/evaluators/aggregation.rs index 0ce7c1bce..f62c2c7f7 100644 --- a/crates/sparrow-instructions/src/evaluators/aggregation.rs +++ b/crates/sparrow-instructions/src/evaluators/aggregation.rs @@ -1,21 +1,17 @@ -mod boolean; +mod first_evaluator; mod function; mod generic; -mod list; -mod map; +mod last_evaluator; mod numeric_properties; mod primitive; -mod string; mod token; mod two_stacks; -pub use boolean::*; +pub use first_evaluator::*; pub use function::*; pub use generic::*; -pub use list::*; -pub use map::*; +pub use last_evaluator::*; pub use numeric_properties::*; pub use primitive::*; -pub use string::*; pub use token::*; pub use two_stacks::*; diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/boolean.rs b/crates/sparrow-instructions/src/evaluators/aggregation/boolean.rs deleted file mode 100644 index 87c03782a..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/boolean.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! Boolean aggregation evaluators. 
- -mod first_boolean_evaluator; -mod last_boolean_evaluator; - -mod two_stacks_first_boolean_evaluator; -mod two_stacks_last_boolean_evaluator; - -pub use first_boolean_evaluator::*; -pub use last_boolean_evaluator::*; diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/boolean/first_boolean_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/boolean/first_boolean_evaluator.rs deleted file mode 100644 index 7ad8155db..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/boolean/first_boolean_evaluator.rs +++ /dev/null @@ -1,383 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use arrow::array::{Array, ArrayRef, BooleanArray, UInt32Array}; -use itertools::izip; -use sparrow_arrow::downcast::downcast_boolean_array; - -use super::two_stacks_first_boolean_evaluator::TwoStacksFirstBooleanEvaluator; -use crate::{ - AggregationArgs, BooleanAccumToken, Evaluator, EvaluatorFactory, RuntimeInfo, StateToken, - StaticInfo, TwoStacksBooleanAccumToken, -}; - -/// Evaluator for the `First` instruction on booleans. -pub struct FirstBooleanEvaluator { - args: AggregationArgs, - token: BooleanAccumToken, -} - -impl Evaluator for FirstBooleanEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::NoWindow { input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ); - result - } - AggregationArgs::Since { ticks, input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let ticks = info.value(ticks)?.boolean_array()?; - let result = Self::aggregate_since( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ticks.as_ref(), - ); - result - } - AggregationArgs::Sliding { .. 
} => { - unreachable!("Expected Non-windowed or Since windowed aggregation, saw Sliding.") - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl EvaluatorFactory for FirstBooleanEvaluator { - fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { - let args = AggregationArgs::from_input(info.args)?; - match args { - AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { - let token = BooleanAccumToken::default(); - Ok(Box::new(Self { token, args })) - } - AggregationArgs::Sliding { .. } => { - let token = TwoStacksBooleanAccumToken::new(); - Ok(Box::new(TwoStacksFirstBooleanEvaluator { token, args })) - } - } - } -} - -impl FirstBooleanEvaluator { - fn ensure_entity_capacity(token: &mut BooleanAccumToken, len: usize) { - token.resize(len); - } - - /// Updates the non-windowed accumulator based on the given flags. - /// - /// Implements a single row of the logic so that we can easily reuse it. - /// We choose to inline this so that it can be specialized in cases where - /// the valid bits are always true. - #[inline] - fn update_accum( - token: &mut BooleanAccumToken, - entity_index: u32, - input_is_valid: bool, - input: bool, - ) -> anyhow::Result> { - let value_to_emit = if let Some(value) = token.get_optional_value(entity_index)? { - Some(value) - } else if input_is_valid { - token.put_optional_value(entity_index, Some(input))?; - Some(input) - } else { - None - }; - - Ok(value_to_emit) - } - - /// Updates the since-windowed accumulator based on the given flags. - /// - /// Accumulator behavior is to update -> emit -> reset, resulting in - /// exclusive start bounds and inclusive end bounds. - /// - /// Implements a single row of the logic so that we can easily reuse it. - /// We choose to inline this so that it can be specialized in cases where - /// the valid bits are always true. 
- #[inline] - fn update_since_accum( - token: &mut BooleanAccumToken, - entity_index: u32, - input_is_valid: bool, - since_is_valid: bool, - input: bool, - since_bool: bool, - ) -> anyhow::Result> { - let value_to_emit = if let Some(value) = token.get_optional_value(entity_index)? { - Some(value) - } else if input_is_valid { - token.put_optional_value(entity_index, Some(input))?; - Some(input) - } else { - None - }; - - if since_is_valid && since_bool { - token.put_optional_value(entity_index, None)?; - } - - Ok(value_to_emit) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. - fn aggregate( - token: &mut BooleanAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &BooleanArray = downcast_boolean_array(input.as_ref())?; - - // Make sure the internal buffers are large enough for the accumulators we may - // want to store. - Self::ensure_entity_capacity(token, key_capacity); - - let result: BooleanArray = if let Some(input_valid_bits) = input.nulls() { - izip!(key_indices.values(), input_valid_bits, 0..) - .map(|(entity_index, input_is_valid, input_index)| { - Self::update_accum( - token, - *entity_index, - input_is_valid, - input.value(input_index), - ) - }) - .collect::>()? - } else { - izip!(key_indices.values(), 0..) - .map(|(entity_index, input_index)| { - Self::update_accum(token, *entity_index, true, input.value(input_index)) - }) - .collect::>()? 
- }; - - Ok(Arc::new(result)) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Window Behavior - /// This aggregation uses the `since` window behavior, which takes a single - /// predicate. If the predicate evaluates to true, the accumulated value is - /// reset. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. - fn aggregate_since( - token: &mut BooleanAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - window_since: &BooleanArray, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &BooleanArray = downcast_boolean_array(input.as_ref())?; - - // Make sure the internal buffers are large enough for the accumulators we may - // want to store. 
- Self::ensure_entity_capacity(token, key_capacity); - - let result: BooleanArray = match (input.nulls(), window_since.nulls()) { - (None, None) => izip!(key_indices.values(), 0.., window_since.values().iter(),) - .map(|(entity_index, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - true, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), None) => izip!( - key_indices.values(), - input_valid_bits, - 0.., - window_since.values().iter() - ) - .map(|(entity_index, input_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - input_is_valid, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (None, Some(window_valid_bits)) => izip!( - key_indices.values(), - window_valid_bits, - 0.., - window_since.values().iter() - ) - .map(|(entity_index, since_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - true, - since_is_valid, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), Some(window_valid_bits)) => izip!( - key_indices.values(), - input_valid_bits, - window_valid_bits, - 0.., - window_since.values().iter() - ) - .map( - |(entity_index, input_is_valid, since_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - input_is_valid, - since_is_valid, - input.value(input_index), - since_bool, - ) - }, - ) - .collect::>()?, - }; - - Ok(Arc::new(result)) - } -} - -#[cfg(test)] -mod tests { - - use super::*; - - #[test] - fn test_boolean_first_with_no_null() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1, 0]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(false), - Some(false), - Some(true), - ])); - let mut token = BooleanAccumToken::default(); - - let output = - FirstBooleanEvaluator::aggregate(&mut token, 3, 
&entity_indices, &input).unwrap(); - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - Some(true), - Some(false) - ]) - ); - } - - #[test] - fn test_boolean_first_with_null() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1, 0]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - Some(false), - Some(true), - None, - None, - Some(false), - None, - ])); - let mut token = BooleanAccumToken::default(); - - let output = - FirstBooleanEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![ - Some(false), - Some(true), - None, - Some(true), - Some(true), - Some(false) - ]) - ); - - // And another round (to make sure values carry over) - let entity_indices = UInt32Array::from(vec![0, 1, 1, 2, 3, 0]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - None, - None, - Some(true), - Some(true), - None, - Some(true), - ])); - - let output = - FirstBooleanEvaluator::aggregate(&mut token, 4, &entity_indices, &input).unwrap(); - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![ - Some(false), - Some(true), - Some(true), - Some(true), - None, - Some(false) - ]) - ); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/boolean/last_boolean_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/boolean/last_boolean_evaluator.rs deleted file mode 100644 index 83d4fcbe0..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/boolean/last_boolean_evaluator.rs +++ /dev/null @@ -1,421 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use arrow::array::{Array, ArrayRef, BooleanArray, UInt32Array}; -use itertools::izip; -use sparrow_arrow::downcast::downcast_boolean_array; - -use super::two_stacks_last_boolean_evaluator::TwoStacksLastBooleanEvaluator; 
-use crate::{ - AggregationArgs, BooleanAccumToken, Evaluator, EvaluatorFactory, RuntimeInfo, StateToken, - StaticInfo, TwoStacksBooleanAccumToken, -}; - -/// Evaluator for the `last` instruction on booleans. -pub struct LastBooleanEvaluator { - args: AggregationArgs, - token: BooleanAccumToken, -} - -impl Evaluator for LastBooleanEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::NoWindow { input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ); - result - } - AggregationArgs::Since { ticks, input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let ticks = info.value(ticks)?.boolean_array()?; - let result = Self::aggregate_since( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ticks.as_ref(), - ); - result - } - AggregationArgs::Sliding { .. } => { - unreachable!("Expected Non-windowed or Since windowed aggregation, saw Sliding.") - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl EvaluatorFactory for LastBooleanEvaluator { - fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { - let args = AggregationArgs::from_input(info.args)?; - match args { - AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { - let token = BooleanAccumToken::default(); - Ok(Box::new(Self { token, args })) - } - AggregationArgs::Sliding { .. 
} => { - let token = TwoStacksBooleanAccumToken::new(); - Ok(Box::new(TwoStacksLastBooleanEvaluator { token, args })) - } - } - } -} - -impl LastBooleanEvaluator { - fn ensure_entity_capacity(token: &mut BooleanAccumToken, len: usize) { - token.resize(len); - } - - /// Updates the non-windowed accumulator based on the given flags. - /// - /// Implements a single row of the logic so that we can easily reuse it. - /// We choose to inline this so that it can be specialized in cases where - /// the valid bits are always true. - #[inline] - fn update_accum( - token: &mut BooleanAccumToken, - entity_index: u32, - input_is_valid: bool, - input: bool, - ) -> anyhow::Result> { - let value_to_emit = if input_is_valid { - token.put_optional_value(entity_index, Some(input))?; - Some(input) - } else { - token.get_optional_value(entity_index)? - }; - - Ok(value_to_emit) - } - - /// Updates the since-windowed accumulator based on the given flags. - /// - /// Accumulator behavior is to update -> emit -> reset, resulting in - /// exclusive start bounds and inclusive end bounds. - /// - /// Implements a single row of the logic so that we can easily reuse it. - /// We choose to inline this so that it can be specialized in cases where - /// the valid bits are always true. - #[inline] - fn update_since_accum( - token: &mut BooleanAccumToken, - entity_index: u32, - input_is_valid: bool, - since_is_valid: bool, - input: bool, - since_bool: bool, - ) -> anyhow::Result> { - let value_to_emit = if input_is_valid { - token.put_optional_value(entity_index, Some(input))?; - Some(input) - } else { - token.get_optional_value(entity_index)? - }; - - if since_is_valid && since_bool { - token.put_optional_value(entity_index, None)?; - } - - Ok(value_to_emit) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. 
- /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. - fn aggregate( - token: &mut BooleanAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &BooleanArray = downcast_boolean_array(input.as_ref())?; - - // Make sure the internal buffers are large enough for the accumulators we may - // want to store. - Self::ensure_entity_capacity(token, key_capacity); - - let result: BooleanArray = if let Some(input_valid_bits) = input.nulls() { - izip!(key_indices.values(), input_valid_bits, 0..) - .map(|(entity_index, input_is_valid, input_index)| { - Self::update_accum( - token, - *entity_index, - input_is_valid, - input.value(input_index), - ) - }) - .collect::>()? - } else { - izip!(key_indices.values(), 0..) - .map(|(entity_index, input_index)| { - Self::update_accum(token, *entity_index, true, input.value(input_index)) - }) - .collect::>()? - }; - - Ok(Arc::new(result)) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Window Behavior - /// This aggregation uses the `since` window behavior, which takes a single - /// predicate. If the predicate evaluates to true, the accumulated value is - /// reset. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. 
- fn aggregate_since( - token: &mut BooleanAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - window_since: &BooleanArray, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &BooleanArray = downcast_boolean_array(input.as_ref())?; - - // Make sure the internal buffers are large enough for the accumulators we may - // want to store. - Self::ensure_entity_capacity(token, key_capacity); - - let result: BooleanArray = match (input.nulls(), window_since.nulls()) { - (None, None) => izip!(key_indices.values(), 0.., window_since.values().iter()) - .map(|(entity_index, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - true, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), None) => izip!( - key_indices.values(), - input_valid_bits, - 0.., - window_since.values().iter() - ) - .map(|(entity_index, input_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - input_is_valid, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (None, Some(window_valid_bits)) => izip!( - key_indices.values(), - window_valid_bits, - 0.., - window_since.values().iter() - ) - .map(|(entity_index, since_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - true, - since_is_valid, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), Some(window_valid_bits)) => izip!( - key_indices.values(), - input_valid_bits, - window_valid_bits, - 0.., - window_since.values().iter() - ) - .map( - |(entity_index, input_is_valid, since_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - input_is_valid, - since_is_valid, - input.value(input_index), - since_bool, - ) - }, - ) - .collect::>()?, - }; - - Ok(Arc::new(result)) - } -} - -#[cfg(test)] -mod tests { - - use 
super::*; - - #[test] - fn test_boolean_last_with_no_null() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - Some(false), - ])); - let mut token = BooleanAccumToken::default(); - - let output = - LastBooleanEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - Some(false) - ]) - ); - } - - #[test] - fn test_boolean_last_with_null() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - Some(false), - Some(true), - None, - None, - Some(false), - ])); - let mut token = BooleanAccumToken::default(); - - let output = - LastBooleanEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); - - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![Some(false), Some(true), None, Some(true), Some(false)]) - ); - - // And another round (to make sure values carry over) - let entity_indices = UInt32Array::from(vec![0, 1, 1, 2, 3]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - None, - None, - Some(true), - Some(true), - None, - ])); - let output = - LastBooleanEvaluator::aggregate(&mut token, 4, &entity_indices, &input).unwrap(); - - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![Some(false), Some(false), Some(true), Some(true), None]) - ); - } - - #[test] - fn test_boolean_last_since_with_null() { - let entity_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - Some(false), - Some(true), - None, - None, - Some(false), - ])); - let since = BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - Some(false), - ]); - 
let mut token = BooleanAccumToken::default(); - - let output = - LastBooleanEvaluator::aggregate_since(&mut token, 3, &entity_indices, &input, &since) - .unwrap(); - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![ - Some(false), - Some(true), - Some(true), - Some(true), - Some(false) - ]) - ); - - // And another round (to make sure values carry over) - let entity_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - None, - None, - Some(true), - Some(true), - None, - ])); - let since = BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - Some(false), - ]); - - let output = - LastBooleanEvaluator::aggregate_since(&mut token, 4, &entity_indices, &input, &since) - .unwrap(); - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![Some(false), Some(false), Some(true), Some(true), None]) - ); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/boolean/two_stacks_first_boolean_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/boolean/two_stacks_first_boolean_evaluator.rs deleted file mode 100644 index 4294fa041..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/boolean/two_stacks_first_boolean_evaluator.rs +++ /dev/null @@ -1,268 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use anyhow::anyhow; -use arrow::array::{Array, ArrayRef, BooleanArray, UInt32Array}; -use arrow::datatypes::Int64Type; -use itertools::izip; -use sparrow_arrow::downcast::downcast_boolean_array; - -use crate::{ - AggregationArgs, Evaluator, FirstBoolean, RuntimeInfo, StateToken, TwoStacks, - TwoStacksBooleanAccumToken, -}; - -/// Evaluator for the `First` instruction on booleans. 
-pub(crate) struct TwoStacksFirstBooleanEvaluator { - pub args: AggregationArgs, - pub token: TwoStacksBooleanAccumToken, -} - -impl Evaluator for TwoStacksFirstBooleanEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::Sliding { - input, - ticks, - duration, - } => { - // Get the stored state of the accum - let mut accum = self.token.get_boolean_accum()?; - - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let ticks = info.value(ticks)?.boolean_array()?; - let duration = info - .value(duration)? - .try_primitive_literal::()? - .ok_or_else(|| anyhow!("Expected non-null literal duration"))?; - if duration <= 0 { - anyhow::bail!( - "Expected positive duration for sliding window, saw {:?}", - duration - ); - } - let result = Self::aggregate( - &mut accum, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - duration, - ticks.as_ref(), - ); - - // Store the new state - self.token.put_boolean_accum(accum)?; - - result - } - AggregationArgs::Since { .. } | AggregationArgs::NoWindow { .. } => { - unreachable!( - "Expected sliding-windowed aggregation, saw non-windowed or since windowed." - ) - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl TwoStacksFirstBooleanEvaluator { - fn ensure_entity_capacity( - accum: &mut Vec>, - entity_capacity: usize, - window_parts: i64, - ) { - if entity_capacity > accum.len() { - accum.resize(entity_capacity, TwoStacks::new(window_parts)); - } - } - - /// Updates the windowed accumulator based on the given flags. - /// - /// Accumulator behavior is to update -> emit -> reset, resulting in - /// exclusive start bounds and inclusive end bounds. 
- #[inline] - fn update_two_stacks_accum( - accum: &mut [TwoStacks], - entity_index: u32, - input_is_valid: bool, - sliding_is_valid: bool, - input: bool, - sliding: bool, - ) -> anyhow::Result> { - if input_is_valid { - accum[entity_index as usize].add_input(&input); - } - - let value_to_emit = accum[entity_index as usize].accum_value(); - - if sliding_is_valid && sliding { - accum[entity_index as usize].evict(); - } - - Ok(value_to_emit) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Window Behavior - /// This aggregation uses the `sliding` window behavior. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. 
- fn aggregate( - accum: &mut Vec>, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - sliding_duration: i64, - sliding_window: &BooleanArray, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &BooleanArray = downcast_boolean_array(input.as_ref())?; - - Self::ensure_entity_capacity(accum, key_capacity, sliding_duration); - - let result: BooleanArray = match (input.nulls(), sliding_window.nulls()) { - (None, None) => izip!(key_indices.values(), 0.., sliding_window.values().iter()) - .map(|(entity_index, input_index, since_bool)| { - Self::update_two_stacks_accum( - accum, - *entity_index, - true, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), None) => izip!( - key_indices.values(), - input_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map(|(entity_index, input_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - accum, - *entity_index, - input_is_valid, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (None, Some(window_valid_bits)) => izip!( - key_indices.values(), - window_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map(|(entity_index, since_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - accum, - *entity_index, - true, - since_is_valid, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), Some(window_valid_bits)) => izip!( - key_indices.values(), - input_valid_bits, - window_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map( - |(entity_index, input_is_valid, since_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - accum, - *entity_index, - input_is_valid, - since_is_valid, - input.value(input_index), - since_bool, - ) - }, - ) - .collect::>()?, - }; - - Ok(Arc::new(result)) - } -} - -#[cfg(test)] -mod tests { - - use std::sync::Arc; - - use super::*; - - #[test] - fn 
test_sliding_boolean_first_with_no_null() { - // With no null values - let entity_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - Some(false), - ])); - let sliding = BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - Some(false), - ]); - let mut token = TwoStacksBooleanAccumToken::new(); - let mut accum = token.get_boolean_accum().unwrap(); - - let output = TwoStacksFirstBooleanEvaluator::aggregate( - &mut accum, - 1, - &entity_indices, - &input, - 2, - &sliding, - ) - .unwrap(); - - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![ - Some(false), - Some(false), - Some(true), - Some(true), - Some(false) - ]) - ); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/boolean/two_stacks_last_boolean_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/boolean/two_stacks_last_boolean_evaluator.rs deleted file mode 100644 index 5166be660..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/boolean/two_stacks_last_boolean_evaluator.rs +++ /dev/null @@ -1,263 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use anyhow::anyhow; -use arrow::array::{Array, ArrayRef, BooleanArray, UInt32Array}; -use arrow::datatypes::Int64Type; -use itertools::izip; -use sparrow_arrow::downcast::downcast_boolean_array; - -use crate::{ - AggregationArgs, Evaluator, LastBoolean, RuntimeInfo, StateToken, TwoStacks, - TwoStacksBooleanAccumToken, -}; - -/// Evaluator for the `last` instruction on booleans. 
-pub(crate) struct TwoStacksLastBooleanEvaluator { - pub args: AggregationArgs, - pub token: TwoStacksBooleanAccumToken, -} - -impl Evaluator for TwoStacksLastBooleanEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::Sliding { - input, - ticks, - duration, - } => { - // Get the stored state of the accum - let mut accum = self.token.get_boolean_accum()?; - - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let ticks = info.value(ticks)?.boolean_array()?; - let duration = info - .value(duration)? - .try_primitive_literal::()? - .ok_or_else(|| anyhow!("Expected non-null literal duration"))?; - if duration <= 0 { - anyhow::bail!( - "Expected positive duration for sliding window, saw {:?}", - duration - ); - } - let result = Self::aggregate( - &mut accum, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - duration, - ticks.as_ref(), - ); - // Store the new state - self.token.put_boolean_accum(accum)?; - - result - } - AggregationArgs::Since { .. } | AggregationArgs::NoWindow { .. } => { - unreachable!( - "Expected sliding-windowed aggregation, saw non-windowed or since windowed." - ) - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl TwoStacksLastBooleanEvaluator { - fn ensure_entity_capacity( - accum: &mut Vec>, - entity_capacity: usize, - window_parts: i64, - ) { - if entity_capacity > accum.len() { - accum.resize(entity_capacity, TwoStacks::new(window_parts)); - } - } - - /// Updates the windowed accumulator based on the given flags. - /// - /// Accumulator behavior is to update -> emit -> reset, resulting in - /// exclusive start bounds and inclusive end bounds. 
- #[inline] - fn update_two_stacks_accum( - accum: &mut [TwoStacks], - entity_index: u32, - input_is_valid: bool, - sliding_is_valid: bool, - input: bool, - sliding: bool, - ) -> anyhow::Result> { - if input_is_valid { - accum[entity_index as usize].add_input(&input); - } - - let value_to_emit = accum[entity_index as usize].accum_value(); - - if sliding_is_valid && sliding { - accum[entity_index as usize].evict(); - } - - Ok(value_to_emit) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Window Behavior - /// This aggregation uses the `sliding` window behavior. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. 
- fn aggregate( - accum: &mut Vec>, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - sliding_duration: i64, - sliding_window: &BooleanArray, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &BooleanArray = downcast_boolean_array(input.as_ref())?; - - Self::ensure_entity_capacity(accum, key_capacity, sliding_duration); - - let result: BooleanArray = match (input.nulls(), sliding_window.nulls()) { - (None, None) => izip!(key_indices.values(), 0.., sliding_window.values().iter()) - .map(|(entity_index, input_index, since_bool)| { - Self::update_two_stacks_accum( - accum, - *entity_index, - true, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), None) => izip!( - key_indices.values(), - input_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map(|(entity_index, input_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - accum, - *entity_index, - input_is_valid, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (None, Some(window_valid_bits)) => izip!( - key_indices.values(), - window_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map(|(entity_index, since_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - accum, - *entity_index, - true, - since_is_valid, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), Some(window_valid_bits)) => izip!( - key_indices.values(), - input_valid_bits, - window_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map( - |(entity_index, input_is_valid, since_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - accum, - *entity_index, - input_is_valid, - since_is_valid, - input.value(input_index), - since_bool, - ) - }, - ) - .collect::>()?, - }; - - Ok(Arc::new(result)) - } -} - -#[cfg(test)] -mod tests { - - use super::*; - - #[test] - fn 
test_sliding_boolean_last_with_no_null() { - let entity_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let input: ArrayRef = Arc::new(BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - Some(false), - ])); - let sliding = BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - Some(false), - ]); - let mut token = TwoStacksBooleanAccumToken::new(); - let mut accum = token.get_boolean_accum().unwrap(); - - let output = TwoStacksLastBooleanEvaluator::aggregate( - &mut accum, - 1, - &entity_indices, - &input, - 2, - &sliding, - ) - .unwrap(); - assert_eq!( - downcast_boolean_array(output.as_ref()).unwrap(), - &BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - Some(true), - Some(false) - ]) - ); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/first_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/first_evaluator.rs new file mode 100644 index 000000000..2a8e9f5f2 --- /dev/null +++ b/crates/sparrow-instructions/src/evaluators/aggregation/first_evaluator.rs @@ -0,0 +1,410 @@ +use crate::ValueRef; +use crate::{ + AggregationArgs, ArrayRefAccumToken, Evaluator, EvaluatorFactory, RuntimeInfo, StateToken, + StaticInfo, +}; +use arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray, UInt32Array}; + +/// Evaluator for the `First` instruction. 
+pub struct FirstEvaluator { + args: AggregationArgs, + token: ArrayRefAccumToken, +} + +impl Evaluator for FirstEvaluator { + fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { + match &self.args { + AggregationArgs::NoWindow { input } => { + let grouping = info.grouping(); + let input_vals = info.value(input)?.array_ref()?; + let result = Self::aggregate( + &mut self.token, + grouping.num_groups(), + grouping.group_indices(), + &input_vals, + ); + + result + } + AggregationArgs::Since { ticks, input } => { + let grouping = info.grouping(); + let input_vals = info.value(input)?.array_ref()?; + let ticks = info.value(ticks)?.boolean_array()?; + let result = Self::aggregate_since( + &mut self.token, + grouping.num_groups(), + grouping.group_indices(), + &input_vals, + ticks.as_ref(), + ); + + result + } + AggregationArgs::Sliding { .. } => { + panic!("expected non-windowed or since-windowed aggregation, saw sliding.") + } + } + } + + fn state_token(&self) -> Option<&dyn StateToken> { + Some(&self.token) + } + + fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { + Some(&mut self.token) + } +} + +impl EvaluatorFactory for FirstEvaluator { + fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { + let args = AggregationArgs::from_input(info.args)?; + match args { + AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { + let token = ArrayRefAccumToken::empty(info.result_type); + Ok(Box::new(Self { token, args })) + } + AggregationArgs::Sliding { .. } => { + unimplemented!("sliding window aggregation over list unsupported") + } + } + } +} + +impl FirstEvaluator { + /// Resizes the accumulator to the new size. + fn ensure_entity_capacity(token: &mut ArrayRefAccumToken, len: usize) -> anyhow::Result<()> { + token.resize(len) + } + + /// Returns the existing value for an entity if it exists, or a new value from the + /// input if it exists, or null if neither. 
+ /// + /// Takes advantage of the `take` and `concat` kernels to avoid having to type the + /// evaluator, keeping everything as ArrayRefs. + /// + /// The output is taken from the concatenated batch of the old state and the new input. + /// If the old state's value is null, then the take index for that entity is the length + /// of the old state plus the current index (i.e. the index into the new input). + /// If not, then we keep the take index as the old state's index. + fn aggregate( + token: &mut ArrayRefAccumToken, + key_capacity: usize, + key_indices: &UInt32Array, + input: &ArrayRef, + ) -> anyhow::Result { + Self::ensure_entity_capacity(token, key_capacity)?; + + let mut take_new_state: Vec = (0..token.accum.len() as u32).collect(); + let mut take_output_builder = UInt32Array::builder(input.len()); + for input_index in 0..input.len() { + let entity_index = key_indices.value(input_index) as usize; + if token.value_is_null(entity_index) && input.is_valid(input_index) { + // If the `take_new_state[entity_index]` is greater than the length, that + // means it has been set already, so we should not overwrite it. + // + // Note that we only check if the token was previously null. 
+ let not_taken = take_new_state[entity_index] < take_new_state.len() as u32; + if not_taken { + take_new_state[entity_index] = (input_index + take_new_state.len()) as u32; + } + }; + + take_output_builder.append_value(take_new_state[entity_index]) + } + + // Gather the output, using the previous state and the new input + let output = + sparrow_arrow::concat_take(&token.accum, input, &take_output_builder.finish())?; + + // Update the state token with the new state + let take_new_state = PrimitiveArray::from_iter_values(take_new_state); + let new_state = sparrow_arrow::concat_take(&token.accum, input, &take_new_state)?; + token.set_state(new_state); + + Ok(output) + } + + /// Returns the existing value for an entity if it exists, or a new value from the + /// input if it exists, or null if neither. + /// + /// Takes advantage of the `take` and `concat` kernels to avoid having to type the + /// evaluator, keeping everything as ArrayRefs. + /// + /// The output is taken from the concatenated batch of the old state and the new input. + /// If the old state's value is null, then the take index for that entity is the length + /// of the old state plus the current index (i.e. the index into the new input). + /// If not, then we keep the take index as the old state's index. 
+ fn aggregate_since( + token: &mut ArrayRefAccumToken, + key_capacity: usize, + key_indices: &UInt32Array, + input: &ArrayRef, + ticks: &BooleanArray, + ) -> anyhow::Result { + Self::ensure_entity_capacity(token, key_capacity)?; + + let mut take_new_state: Vec> = (0..token.accum.len()) + .map(|index| { + if token.accum.is_valid(index) { + Some(index as u32) + } else { + None + } + }) + .collect(); + + let mut take_output_builder = UInt32Array::builder(input.len()); + for input_index in 0..input.len() { + let entity_index = key_indices.value(input_index) as usize; + if input.is_valid(input_index) && take_new_state[entity_index].is_none() { + take_new_state[entity_index] = Some((input_index + take_new_state.len()) as u32); + }; + + take_output_builder.append_option(take_new_state[entity_index]); + + if ticks.value(input_index) && ticks.is_valid(input_index) { + take_new_state[entity_index] = None; + } + } + + // Gather the output, using the previous state and the new input + let output = + sparrow_arrow::concat_take(&token.accum, input, &take_output_builder.finish())?; + + // Update the state token with the new state + let take_new_state = PrimitiveArray::from_iter(take_new_state); + let new_state = sparrow_arrow::concat_take(&token.accum, input, &take_new_state)?; + token.set_state(new_state); + + Ok(output) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{AsArray, Float64Array, Int64Builder, ListBuilder, MapBuilder}; + use arrow_schema::{DataType, Field, Fields}; + use std::sync::Arc; + + fn empty_list_i64_token() -> ArrayRefAccumToken { + let f = Arc::new(Field::new("item", DataType::Int64, true)); + let list = DataType::List(f); + ArrayRefAccumToken::empty(&list) + } + + #[test] + fn test_first_list_multiple_batches() { + let mut token = empty_list_i64_token(); + let key_indices = UInt32Array::from(vec![0, 0, 0, 0, 0, 0]); + let key_capacity = 1; + + // Batch 1 + let mut builder = ListBuilder::new(Int64Builder::new()); + 
builder.append_value([Some(1), Some(2), Some(3)]); + builder.append_value([Some(4), None, Some(5)]); + builder.append_value([None, None]); + builder.append(false); + builder.append_value([]); + builder.append_value([Some(7), Some(8), Some(9)]); + + let array = builder.finish(); + + let input: ArrayRef = Arc::new(array); + let result = + FirstEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); + let result = result.as_list(); + + let mut builder = ListBuilder::new(Int64Builder::new()); + for _ in 0..6 { + builder.append_value([Some(1), Some(2), Some(3)]); + } + let expected = builder.finish(); + + assert_eq!(&expected, result); + + // Batch 2 + let mut builder = ListBuilder::new(Int64Builder::new()); + builder.append_value([Some(10), Some(11)]); + builder.append(true); + builder.append_value([Some(13), None]); + builder.append(false); + builder.append(false); + builder.append_value([Some(14)]); + + let array = builder.finish(); + let input: ArrayRef = Arc::new(array); + + // Introduce more entities + let key_indices = UInt32Array::from(vec![0, 1, 2, 1, 0, 1]); + let key_capacity = 3; + let result = + FirstEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); + let result = result.as_list(); + + let mut builder = ListBuilder::new(Int64Builder::new()); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append(true); + builder.append_value([Some(13), None]); + builder.append(true); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append(true); + let expected = builder.finish(); + + assert_eq!(&expected, result); + } + + fn empty_map_i64_token() -> ArrayRefAccumToken { + let k = Field::new("keys", DataType::Int64, false); + let v = Field::new("values", DataType::Int64, true); + let fields = Fields::from(vec![k, v]); + let s = Arc::new(Field::new("entries", DataType::Struct(fields), false)); + let map = DataType::Map(s, false); + ArrayRefAccumToken::empty(&map) + } + + #[test] + fn 
test_first_map_multiple_batches() { + let mut token = empty_map_i64_token(); + let key_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); + let key_capacity = 1; + + // Batch 1 + let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); + builder.keys().append_value(1); + builder.values().append_value(1); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(2); + builder.keys().append_value(2); + builder.values().append_value(4); + builder.append(true).unwrap(); + + builder.append(true).unwrap(); + + builder.keys().append_value(2); + builder.values().append_value(99); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(10); + builder.keys().append_value(3); + builder.values().append_value(7); + builder.append(true).unwrap(); + let array = builder.finish(); + + let input: ArrayRef = Arc::new(array); + let result = + FirstEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); + let result = result.as_map(); + + let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); + for _ in 0..5 { + builder.keys().append_value(1); + builder.values().append_value(1); + builder.append(true).unwrap(); + } + let expected = builder.finish(); + + assert_eq!(&expected, result); + + // Batch 2 + let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); + builder.keys().append_value(1); + builder.values().append_value(1); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(2); + builder.keys().append_value(2); + builder.values().append_value(4); + builder.append(true).unwrap(); + + builder.append(true).unwrap(); + + builder.keys().append_value(2); + builder.values().append_value(99); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(10); + builder.keys().append_value(3); + 
builder.values().append_value(7); + builder.append(true).unwrap(); + + let array = builder.finish(); + let input: ArrayRef = Arc::new(array); + + // Introduce second entity key + let key_indices = UInt32Array::from(vec![0, 1, 0, 1, 0]); + let key_capacity = 2; + let result = + FirstEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); + let result = result.as_map(); + + let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); + builder.keys().append_value(1); + builder.values().append_value(1); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(2); + builder.keys().append_value(2); + builder.values().append_value(4); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(1); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(2); + builder.keys().append_value(2); + builder.values().append_value(4); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(1); + builder.append(true).unwrap(); + let expected = builder.finish(); + + assert_eq!(&expected, result); + } + + #[test] + fn test_first_f64() { + let mut token = ArrayRefAccumToken::empty(&DataType::Float64); + let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); + let input: ArrayRef = Arc::new(Float64Array::from(vec![ + Some(1.0), + Some(2.0), + None, + None, + Some(3.0), + ])); + + let output = FirstEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); + + let output: &Float64Array = output.as_primitive(); + assert_eq!( + output, + &Float64Array::from(vec![Some(1.0), Some(2.0), None, Some(2.0), Some(2.0)]) + ); + + let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); + let input: ArrayRef = Arc::new(Float64Array::from(vec![ + None, + Some(4.0), + Some(5.0), + None, + None, + ])); + let output = FirstEvaluator::aggregate(&mut token, 3, 
&entity_indices, &input).unwrap(); + let output: &Float64Array = output.as_primitive(); + assert_eq!( + output, + &Float64Array::from(vec![Some(1.0), Some(2.0), Some(5.0), Some(2.0), Some(2.0)]) + ); + } +} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/function.rs b/crates/sparrow-instructions/src/evaluators/aggregation/function.rs index ad4e3b50d..a7132e6b8 100644 --- a/crates/sparrow-instructions/src/evaluators/aggregation/function.rs +++ b/crates/sparrow-instructions/src/evaluators/aggregation/function.rs @@ -1,13 +1,9 @@ //! Aggregation functions. pub mod agg_fn; -pub mod boolean_agg_fn; pub mod count_agg_fn; pub mod primitive_agg_fn; -pub mod string_agg_fn; pub use agg_fn::*; -pub use boolean_agg_fn::*; pub use count_agg_fn::*; pub use primitive_agg_fn::*; -pub use string_agg_fn::*; diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/function/boolean_agg_fn.rs b/crates/sparrow-instructions/src/evaluators/aggregation/function/boolean_agg_fn.rs deleted file mode 100644 index 5fdca1ef7..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/function/boolean_agg_fn.rs +++ /dev/null @@ -1,65 +0,0 @@ -use super::agg_fn::AggFn; - -/// Placeholder struct for the implementation of the [[AggFn]] for `First` on -/// booleans. -pub struct FirstBoolean {} -impl AggFn for FirstBoolean { - type InT = bool; - type AccT = Option; - type OutT = bool; - - fn zero() -> Self::AccT { - None - } - - fn merge(acc1: &mut Self::AccT, acc2: &Self::AccT) { - if acc1.is_none() { - *acc1 = acc2.to_owned() - } - } - - fn extract(acc: &Self::AccT) -> Option { - acc.as_ref().map(|s| s.to_owned()) - } - - fn add_one(acc: &mut Self::AccT, input: &Self::InT) { - if acc.is_none() { - *acc = Some(input.to_owned()) - } - } - - fn name() -> &'static str { - "first_string" - } -} - -/// Placeholder struct for the implementation of the [[AggFn]] for `Last` on -/// booleans. 
-pub struct LastBoolean {} -impl AggFn for LastBoolean { - type InT = bool; - type AccT = Option; - type OutT = bool; - - fn zero() -> Self::AccT { - None - } - - fn merge(acc1: &mut Self::AccT, acc2: &Self::AccT) { - if acc2.is_some() { - *acc1 = acc2.to_owned() - } - } - - fn extract(acc: &Self::AccT) -> Option { - acc.as_ref().map(|s| s.to_owned()) - } - - fn add_one(acc: &mut Self::AccT, input: &Self::InT) { - *acc = Some(input.to_owned()) - } - - fn name() -> &'static str { - "last_string" - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/function/primitive_agg_fn.rs b/crates/sparrow-instructions/src/evaluators/aggregation/function/primitive_agg_fn.rs index f24f2a157..259b01ded 100644 --- a/crates/sparrow-instructions/src/evaluators/aggregation/function/primitive_agg_fn.rs +++ b/crates/sparrow-instructions/src/evaluators/aggregation/function/primitive_agg_fn.rs @@ -70,101 +70,6 @@ where } } -/// Placeholder struct for the implementation of the [[AggFn]] for -/// `first` aggregation on primitives. -pub struct FirstPrimitive -where - T::Native: Copy, -{ - // Make the compiler happy by using the type parameter. Conceptually, the aggregation - // will store values of type T. - _phantom: PhantomData, -} - -impl ArrowAggFn for FirstPrimitive { - type InArrowT = T; - type OutArrowT = T; -} - -impl AggFn for FirstPrimitive { - type InT = T::Native; - type AccT = Option; - type OutT = T::Native; - - fn zero() -> Self::AccT { - None - } - - fn one(input: &Self::InT) -> Self::AccT { - Some(*input) - } - - fn merge(acc1: &mut Self::AccT, acc2: &Self::AccT) { - if acc2.is_some() && acc1.is_none() { - *acc1 = *acc2; - } - } - - fn extract(acc: &Self::AccT) -> Option { - *acc - } - - fn add_one(acc: &mut Self::AccT, input: &Self::InT) { - if acc.is_none() { - *acc = Some(*input) - } - } - - fn name() -> &'static str { - "first" - } -} - -/// Placeholder struct for the implementation of the [[AggFn]] for -/// `last` aggregation on primitives. 
-pub struct LastPrimitive { - // Make the compiler happy by using the type parameter. Conceptually, the aggregation - // will store values of type T. - _phantom: PhantomData, -} - -impl ArrowAggFn for LastPrimitive { - type InArrowT = T; - type OutArrowT = T; -} - -impl AggFn for LastPrimitive { - type InT = T::Native; - type AccT = Option; - type OutT = T::Native; - - fn zero() -> Self::AccT { - None - } - - fn one(input: &Self::InT) -> Self::AccT { - Some(*input) - } - - fn merge(acc1: &mut Self::AccT, acc2: &Self::AccT) { - if acc2.is_some() { - *acc1 = *acc2 - } - } - - fn extract(acc: &Self::AccT) -> Option { - *acc - } - - fn add_one(acc: &mut Self::AccT, input: &Self::InT) { - *acc = Some(*input) - } - - fn name() -> &'static str { - "last" - } -} - /// Placeholder struct for the implementation of the [[AggFn]] for `max` /// aggregation. pub struct Max diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/function/string_agg_fn.rs b/crates/sparrow-instructions/src/evaluators/aggregation/function/string_agg_fn.rs deleted file mode 100644 index aaa89cd6a..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/function/string_agg_fn.rs +++ /dev/null @@ -1,65 +0,0 @@ -use super::agg_fn::AggFn; - -/// Placeholder struct for the implementation of the [[AggFn]] for `First` on -/// strings. -pub struct FirstString {} -impl AggFn for FirstString { - type InT = String; - type AccT = Option; - type OutT = String; - - fn zero() -> Self::AccT { - None - } - - fn merge(acc1: &mut Self::AccT, acc2: &Self::AccT) { - if acc1.is_none() { - *acc1 = acc2.to_owned() - } - } - - fn extract(acc: &Self::AccT) -> Option { - acc.as_ref().map(|s| s.to_owned()) - } - - fn add_one(acc: &mut Self::AccT, input: &Self::InT) { - if acc.is_none() { - *acc = Some(input.to_owned()) - } - } - - fn name() -> &'static str { - "first_string" - } -} - -/// Placeholder struct for the implementation of the [[AggFn]] for `Last` on -/// strings. 
-pub struct LastString {} -impl AggFn for LastString { - type InT = String; - type AccT = Option; - type OutT = String; - - fn zero() -> Self::AccT { - None - } - - fn merge(acc1: &mut Self::AccT, acc2: &Self::AccT) { - if acc2.is_some() { - *acc1 = acc2.to_owned() - } - } - - fn extract(acc: &Self::AccT) -> Option { - acc.as_ref().map(|s| s.to_owned()) - } - - fn add_one(acc: &mut Self::AccT, input: &Self::InT) { - *acc = Some(input.to_owned()) - } - - fn name() -> &'static str { - "last_string" - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/last_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/last_evaluator.rs new file mode 100644 index 000000000..bded8cd48 --- /dev/null +++ b/crates/sparrow-instructions/src/evaluators/aggregation/last_evaluator.rs @@ -0,0 +1,396 @@ +use crate::ValueRef; +use crate::{ + AggregationArgs, ArrayRefAccumToken, Evaluator, EvaluatorFactory, RuntimeInfo, StateToken, + StaticInfo, +}; +use arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray, UInt32Array}; + +/// Evaluator for the `Last` instruction. +pub struct LastEvaluator { + args: AggregationArgs, + token: ArrayRefAccumToken, +} + +impl Evaluator for LastEvaluator { + fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { + match &self.args { + AggregationArgs::NoWindow { input } => { + let grouping = info.grouping(); + let input_vals = info.value(input)?.array_ref()?; + let result = Self::aggregate( + &mut self.token, + grouping.num_groups(), + grouping.group_indices(), + &input_vals, + ); + + result + } + AggregationArgs::Since { ticks, input } => { + let grouping = info.grouping(); + let input_vals = info.value(input)?.array_ref()?; + let ticks = info.value(ticks)?.boolean_array()?; + let result = Self::aggregate_since( + &mut self.token, + grouping.num_groups(), + grouping.group_indices(), + &input_vals, + ticks.as_ref(), + ); + + result + } + AggregationArgs::Sliding { .. 
} => { + panic!("expected non-windowed or since-windowed aggregation, saw sliding.") + } + } + } + + fn state_token(&self) -> Option<&dyn StateToken> { + Some(&self.token) + } + + fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { + Some(&mut self.token) + } +} + +impl EvaluatorFactory for LastEvaluator { + fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { + let args = AggregationArgs::from_input(info.args)?; + match args { + AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { + let token = ArrayRefAccumToken::empty(info.result_type); + Ok(Box::new(Self { token, args })) + } + AggregationArgs::Sliding { .. } => { + unimplemented!("sliding window aggregation over list unsupported") + } + } + } +} + +impl LastEvaluator { + /// Resizes the accumulator to the new size. + fn ensure_entity_capacity(token: &mut ArrayRefAccumToken, len: usize) -> anyhow::Result<()> { + token.resize(len) + } + + /// Returns, for each input row, the most recent non-null value seen for that row's + /// entity: a valid input replaces the stored value, a null input leaves it as is. + /// + /// Takes advantage of the `take` and `concat` kernels to avoid having to type the + /// evaluator, keeping everything as ArrayRefs. + /// + /// The output is taken from the concatenated batch of the old state and the new input. + /// Every valid input sets its entity's take index to the length of the old state plus + /// the current index (i.e. the index into the new input). A null input keeps whatever + /// take index the entity already has (its old state slot or an earlier row of this batch). 
+ fn aggregate( + token: &mut ArrayRefAccumToken, + key_capacity: usize, + key_indices: &UInt32Array, + input: &ArrayRef, + ) -> anyhow::Result { + Self::ensure_entity_capacity(token, key_capacity)?; + + let mut take_new_state: Vec = (0..token.accum.len() as u32).collect(); + let mut take_output_builder = UInt32Array::builder(input.len()); + for input_index in 0..input.len() { + let entity_index = key_indices.value(input_index) as usize; + if input.is_valid(input_index) { + take_new_state[entity_index] = (input_index + take_new_state.len()) as u32; + } + take_output_builder.append_value(take_new_state[entity_index]) + } + + // Gather the output, using the previous state and the new input + let output = + sparrow_arrow::concat_take(&token.accum, input, &take_output_builder.finish())?; + + // Update the state token with the new state + let take_new_state = PrimitiveArray::from_iter_values(take_new_state); + let new_state = sparrow_arrow::concat_take(&token.accum, input, &take_new_state)?; + token.set_state(new_state); + + Ok(output) + } + + /// `since`-windowed variant of `aggregate`: for each input row, selects the most recent + /// non-null value seen for that row's entity since its last tick (null take index if none). + /// + /// Takes advantage of the `take` and `concat` kernels to avoid having to type the + /// evaluator, keeping everything as ArrayRefs. + /// + /// The output is taken from the concatenated batch of the old state and the new input. + /// Every valid input sets its entity's take index to the length of the old state plus + /// the current index (i.e. the index into the new input). After a row is emitted, a + /// valid `true` tick clears that entity's take index, so later rows start from `None`. 
+ fn aggregate_since( + token: &mut ArrayRefAccumToken, + key_capacity: usize, + key_indices: &UInt32Array, + input: &ArrayRef, + ticks: &BooleanArray, + ) -> anyhow::Result { + Self::ensure_entity_capacity(token, key_capacity)?; + + let mut take_new_state: Vec> = (0..token.accum.len()) + .map(|index| { + if token.accum.is_valid(index) { + Some(index as u32) + } else { + None + } + }) + .collect(); + + let mut take_output_builder = UInt32Array::builder(input.len()); + for input_index in 0..input.len() { + let entity_index = key_indices.value(input_index) as usize; + if input.is_valid(input_index) { + take_new_state[entity_index] = Some((input_index + take_new_state.len()) as u32); + } + take_output_builder.append_option(take_new_state[entity_index]); + if ticks.value(input_index) && ticks.is_valid(input_index) { + take_new_state[entity_index] = None; + } + } + + // Gather the output, using the previous state and the new input + let output = + sparrow_arrow::concat_take(&token.accum, input, &take_output_builder.finish())?; + + // Update the state token with the new state + let take_new_state = PrimitiveArray::from_iter(take_new_state); + let new_state = sparrow_arrow::concat_take(&token.accum, input, &take_new_state)?; + token.set_state(new_state); + + Ok(output) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{AsArray, Float64Array, Int64Builder, ListBuilder, MapBuilder}; + use arrow_schema::{DataType, Field, Fields}; + use std::sync::Arc; + + fn empty_list_i64_token() -> ArrayRefAccumToken { + let f = Arc::new(Field::new("item", DataType::Int64, true)); + let list = DataType::List(f); + ArrayRefAccumToken::empty(&list) + } + + #[test] + fn test_last_list_multiple_batches() { + let mut token = empty_list_i64_token(); + let key_indices = UInt32Array::from(vec![0, 0, 0, 0, 0, 0]); + let key_capacity = 1; + + // Batch 1 + let mut builder = ListBuilder::new(Int64Builder::new()); + builder.append_value([Some(1), Some(2), Some(3)]); + 
builder.append_value([Some(4), None, Some(5)]); + builder.append_value([None, None]); + builder.append(false); + builder.append_value([]); + builder.append_value([Some(7), Some(8), Some(9)]); + + let array = builder.finish(); + + let input: ArrayRef = Arc::new(array); + let result = + LastEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); + let result = result.as_list(); + + let mut builder = ListBuilder::new(Int64Builder::new()); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append_value([Some(4), None, Some(5)]); + builder.append_value([None, None]); + builder.append_value([None, None]); + builder.append_value([]); + builder.append_value([Some(7), Some(8), Some(9)]); + let expected = builder.finish(); + + assert_eq!(&expected, result); + + // Batch 2 + let mut builder = ListBuilder::new(Int64Builder::new()); + builder.append_value([Some(10), Some(11)]); + builder.append(true); + builder.append_value([Some(13), None]); + builder.append(false); + builder.append(false); + builder.append_value([Some(14)]); + + let array = builder.finish(); + let input: ArrayRef = Arc::new(array); + + // Introduce more entities + let key_indices = UInt32Array::from(vec![0, 1, 2, 1, 0, 1]); + let key_capacity = 3; + let result = + LastEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); + let result = result.as_list(); + + let mut builder = ListBuilder::new(Int64Builder::new()); + builder.append_value([Some(10), Some(11)]); + builder.append(true); + builder.append_value([Some(13), None]); + builder.append(true); + builder.append_value([Some(10), Some(11)]); + builder.append_value([Some(14)]); + let expected = builder.finish(); + + assert_eq!(&expected, result); + } + + fn empty_map_i64_token() -> ArrayRefAccumToken { + let k = Field::new("keys", DataType::Int64, false); + let v = Field::new("values", DataType::Int64, true); + let fields = Fields::from(vec![k, v]); + let s = Arc::new(Field::new("entries", 
DataType::Struct(fields), false)); + let map = DataType::Map(s, false); + ArrayRefAccumToken::empty(&map) + } + + #[test] + fn test_last_map_multiple_batches() { + let mut token = empty_map_i64_token(); + let key_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); + let key_capacity = 1; + + // Batch 1 + let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); + builder.keys().append_value(1); + builder.values().append_value(1); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(2); + builder.keys().append_value(2); + builder.values().append_value(4); + builder.append(true).unwrap(); + + builder.append(true).unwrap(); + + builder.keys().append_value(2); + builder.values().append_value(99); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(10); + builder.keys().append_value(3); + builder.values().append_value(7); + builder.append(true).unwrap(); + let array = builder.finish(); + + // Every row is valid and all rows share one entity, so `last` echoes each input row + let expected = array.clone(); + + let input: ArrayRef = Arc::new(array); + let result = + LastEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); + let result = result.as_map(); + + assert_eq!(&expected, result); + + // Batch 2 + let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); + builder.append(false).unwrap(); + + builder.append(false).unwrap(); + + builder.append(false).unwrap(); + + builder.keys().append_value(2); + builder.values().append_value(99); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(10); + builder.keys().append_value(3); + builder.values().append_value(7); + builder.append(true).unwrap(); + + let array = builder.finish(); + let input: ArrayRef = Arc::new(array); + + // Introduce second entity key + let key_indices = UInt32Array::from(vec![0, 1, 0, 1, 0]); + let key_capacity = 2; + let result = + 
LastEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); + let result = result.as_map(); + + let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); + // Uses last non-null value + builder.keys().append_value(1); + builder.values().append_value(10); + builder.keys().append_value(3); + builder.values().append_value(7); + builder.append(true).unwrap(); + + // First value for this entity is null, so result is null + builder.append(false).unwrap(); + + // Uses last non-null value + builder.keys().append_value(1); + builder.values().append_value(10); + builder.keys().append_value(3); + builder.values().append_value(7); + builder.append(true).unwrap(); + + builder.keys().append_value(2); + builder.values().append_value(99); + builder.append(true).unwrap(); + + builder.keys().append_value(1); + builder.values().append_value(10); + builder.keys().append_value(3); + builder.values().append_value(7); + builder.append(true).unwrap(); + + let expected = builder.finish(); + + assert_eq!(&expected, result); + } + + #[test] + fn test_last_f64() { + let mut token = ArrayRefAccumToken::empty(&DataType::Float64); + let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); + let input: ArrayRef = Arc::new(Float64Array::from(vec![ + Some(1.0), + Some(2.0), + None, + None, + Some(3.0), + ])); + + let output = LastEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); + + let output: &Float64Array = output.as_primitive(); + assert_eq!( + output, + &Float64Array::from(vec![Some(1.0), Some(2.0), None, Some(2.0), Some(3.0)]) + ); + + let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); + let input: ArrayRef = Arc::new(Float64Array::from(vec![ + None, + Some(4.0), + Some(5.0), + None, + None, + ])); + let output = LastEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); + let output: &Float64Array = output.as_primitive(); + assert_eq!( + output, + &Float64Array::from(vec![Some(1.0), Some(4.0), 
Some(5.0), Some(4.0), Some(4.0)]) + ); + } +} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/list.rs b/crates/sparrow-instructions/src/evaluators/aggregation/list.rs deleted file mode 100644 index 19905c335..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/list.rs +++ /dev/null @@ -1,7 +0,0 @@ -//! List aggregation evaluators. - -mod first_list_evaluator; -mod last_list_evaluator; - -pub use first_list_evaluator::*; -pub use last_list_evaluator::*; diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/list/first_list_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/list/first_list_evaluator.rs deleted file mode 100644 index ba5c3ada2..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/list/first_list_evaluator.rs +++ /dev/null @@ -1,197 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use crate::{ - AggregationArgs, Evaluator, EvaluatorFactory, ListAccumToken, RuntimeInfo, StateToken, - StaticInfo, -}; -use arrow::array::{ - as_list_array, new_empty_array, Array, ArrayRef, AsArray, PrimitiveArray, UInt32Array, -}; - -/// Evaluator for the `First` instruction on lists -pub struct FirstListEvaluator { - args: AggregationArgs, - token: ListAccumToken, -} - -impl Evaluator for FirstListEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::NoWindow { input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ); - - result - } - AggregationArgs::Since { ticks: _, input: _ } => { - unimplemented!("windowed aggregation over lists") - } - AggregationArgs::Sliding { .. 
} => { - panic!("expected non-windowed or since-windowed aggregation, saw sliding.") - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl EvaluatorFactory for FirstListEvaluator { - fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { - let args = AggregationArgs::from_input(info.args)?; - match args { - AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { - let list_type = info.result_type; - let accum = new_empty_array(list_type).as_list::().to_owned(); - let token = ListAccumToken::new(Arc::new(accum)); - Ok(Box::new(Self { token, args })) - } - AggregationArgs::Sliding { .. } => { - unimplemented!("sliding window aggregation over list unsupported") - } - } - } -} - -impl FirstListEvaluator { - /// Resizes the accumulator to the new size. - fn ensure_entity_capacity(token: &mut ListAccumToken, len: usize) -> anyhow::Result<()> { - token.resize(len) - } - - /// Returns the existing value for an entity if it exists, or a new value from the - /// input if it exists, or null if neither. - /// - /// Takes advantage of the `take` and `concat` kernels to avoid having to type the - /// evaluator, keeping everything as ArrayRefs. - /// - /// The output is taken from the concatenated batch of the old state and the new input. - /// If the old state's value is null, then the take index for that entity is the length - /// of the old state plus the current index (i.e. the index into the new input). - /// If not, then we keep the take index as the old state's index. 
- fn aggregate( - token: &mut ListAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - ) -> anyhow::Result { - Self::ensure_entity_capacity(token, key_capacity)?; - let list_input = as_list_array(input); - - let mut take_new_state: Vec = (0..token.accum.len() as u32).collect(); - let mut take_output_builder = UInt32Array::builder(input.len()); - for input_index in 0..list_input.len() { - let entity_index = key_indices.value(input_index); - if token.value_is_null(entity_index) && list_input.is_valid(input_index) { - // If the `take_new_state[entity_index]` is greater than the length, that - // means it has been set already, so we should not overwrite it. - let not_taken = take_new_state[entity_index as usize] < take_new_state.len() as u32; - if not_taken { - take_new_state[entity_index as usize] = - (input_index + take_new_state.len()) as u32; - } - }; - - take_output_builder.append_value(take_new_state[entity_index as usize]) - } - - // Gather the output, using the previous state and the new input - let output = - sparrow_arrow::concat_take(&token.accum, input, &take_output_builder.finish())?; - - // Update the state token with the new state - let take_new_state = PrimitiveArray::from_iter_values(take_new_state); - let new_state = sparrow_arrow::concat_take(&token.accum, input, &take_new_state)?; - token.set_state(new_state); - - Ok(output) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow::array::{AsArray, Int64Builder, ListBuilder}; - use arrow_schema::{DataType, Field}; - use std::sync::Arc; - - fn default_token() -> ListAccumToken { - let f = Arc::new(Field::new("item", DataType::Int64, true)); - let list = DataType::List(f); - let accum = new_empty_array(&list); - ListAccumToken { accum } - } - - #[test] - fn test_first_list_multiple_batches() { - let mut token = default_token(); - let key_indices = UInt32Array::from(vec![0, 0, 0, 0, 0, 0]); - let key_capacity = 1; - - // Batch 1 - let mut builder = 
ListBuilder::new(Int64Builder::new()); - builder.append_value([Some(1), Some(2), Some(3)]); - builder.append_value([Some(4), None, Some(5)]); - builder.append_value([None, None]); - builder.append(false); - builder.append_value([]); - builder.append_value([Some(7), Some(8), Some(9)]); - - let array = builder.finish(); - - let input: ArrayRef = Arc::new(array); - let result = - FirstListEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); - let result = result.as_list(); - - let mut builder = ListBuilder::new(Int64Builder::new()); - for _ in 0..6 { - builder.append_value([Some(1), Some(2), Some(3)]); - } - let expected = builder.finish(); - - assert_eq!(&expected, result); - - // Batch 2 - let mut builder = ListBuilder::new(Int64Builder::new()); - builder.append_value([Some(10), Some(11)]); - builder.append(true); - builder.append_value([Some(13), None]); - builder.append(false); - builder.append(false); - builder.append_value([Some(14)]); - - let array = builder.finish(); - let input: ArrayRef = Arc::new(array); - - // Introduce more entities - let key_indices = UInt32Array::from(vec![0, 1, 2, 1, 0, 1]); - let key_capacity = 3; - let result = - FirstListEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); - let result = result.as_list(); - - let mut builder = ListBuilder::new(Int64Builder::new()); - builder.append_value([Some(1), Some(2), Some(3)]); - builder.append(true); - builder.append_value([Some(13), None]); - builder.append(true); - builder.append_value([Some(1), Some(2), Some(3)]); - builder.append(true); - let expected = builder.finish(); - - assert_eq!(&expected, result); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/list/last_list_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/list/last_list_evaluator.rs deleted file mode 100644 index 2a252b5b2..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/list/last_list_evaluator.rs +++ 
/dev/null @@ -1,194 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use crate::{ - AggregationArgs, Evaluator, EvaluatorFactory, ListAccumToken, RuntimeInfo, StateToken, - StaticInfo, -}; -use arrow::array::{ - as_list_array, new_empty_array, Array, ArrayRef, AsArray, PrimitiveArray, UInt32Array, -}; - -/// Evaluator for the `Last` instruction on lists -pub struct LastListEvaluator { - args: AggregationArgs, - token: ListAccumToken, -} - -impl Evaluator for LastListEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::NoWindow { input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ); - - result - } - AggregationArgs::Since { ticks: _, input: _ } => { - unimplemented!("windowed aggregation over lists") - } - AggregationArgs::Sliding { .. } => { - panic!("expected non-windowed or since-windowed aggregation, saw sliding.") - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl EvaluatorFactory for LastListEvaluator { - fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { - let args = AggregationArgs::from_input(info.args)?; - match args { - AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { - let list_type = info.result_type; - let accum = new_empty_array(list_type).as_list::().to_owned(); - let token = ListAccumToken::new(Arc::new(accum)); - Ok(Box::new(Self { token, args })) - } - AggregationArgs::Sliding { .. } => { - unimplemented!("sliding window aggregation over list unsupported") - } - } - } -} - -impl LastListEvaluator { - /// Resizes the accumulator to the new size. 
- fn ensure_entity_capacity(token: &mut ListAccumToken, len: usize) -> anyhow::Result<()> { - token.resize(len) - } - - /// Returns the existing value for an entity if it exists, or a new value from the - /// input if it exists, or null if neither. - /// - /// Takes advantage of the `take` and `concat` kernels to avoid having to type the - /// evaluator, keeping everything as ArrayRefs. - /// - /// The output is taken from the concatenated batch of the old state and the new input. - /// If the old state's value is null, then the take index for that entity is the length - /// of the old state plus the current index (i.e. the index into the new input). - /// If not, then we keep the take index as the old state's index. - fn aggregate( - token: &mut ListAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - ) -> anyhow::Result { - Self::ensure_entity_capacity(token, key_capacity)?; - let list_input = as_list_array(input); - - let mut take_new_state: Vec = (0..token.accum.len() as u32).collect(); - let mut take_output_builder = UInt32Array::builder(input.len()); - - for input_index in 0..list_input.len() { - let entity_index = key_indices.value(input_index); - if list_input.is_valid(input_index) { - take_new_state[entity_index as usize] = (input_index + take_new_state.len()) as u32; - } - take_output_builder.append_value(take_new_state[entity_index as usize]) - } - - // Gather the output, using the previous state and the new input - let output = - sparrow_arrow::concat_take(&token.accum, input, &take_output_builder.finish())?; - - // Update the state token with the new state - let take_new_state = PrimitiveArray::from_iter_values(take_new_state); - let new_state = sparrow_arrow::concat_take(&token.accum, input, &take_new_state)?; - token.set_state(new_state); - - Ok(output) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow::array::{AsArray, Int64Builder, ListBuilder}; - use arrow_schema::{DataType, Field}; - use 
std::sync::Arc; - - fn default_token() -> ListAccumToken { - let f = Arc::new(Field::new("item", DataType::Int64, true)); - let list = DataType::List(f); - let accum = new_empty_array(&list); - ListAccumToken { accum } - } - - #[test] - fn test_last_list_multiple_batches() { - let mut token = default_token(); - let key_indices = UInt32Array::from(vec![0, 0, 0, 0, 0, 0]); - let key_capacity = 1; - - // Batch 1 - let mut builder = ListBuilder::new(Int64Builder::new()); - builder.append_value([Some(1), Some(2), Some(3)]); - builder.append_value([Some(4), None, Some(5)]); - builder.append_value([None, None]); - builder.append(false); - builder.append_value([]); - builder.append_value([Some(7), Some(8), Some(9)]); - - let array = builder.finish(); - - let input: ArrayRef = Arc::new(array); - let result = - LastListEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); - let result = result.as_list(); - - let mut builder = ListBuilder::new(Int64Builder::new()); - builder.append_value([Some(1), Some(2), Some(3)]); - builder.append_value([Some(4), None, Some(5)]); - builder.append_value([None, None]); - builder.append_value([None, None]); - builder.append_value([]); - builder.append_value([Some(7), Some(8), Some(9)]); - let expected = builder.finish(); - - assert_eq!(&expected, result); - - // Batch 2 - let mut builder = ListBuilder::new(Int64Builder::new()); - builder.append_value([Some(10), Some(11)]); - builder.append(true); - builder.append_value([Some(13), None]); - builder.append(false); - builder.append(false); - builder.append_value([Some(14)]); - - let array = builder.finish(); - let input: ArrayRef = Arc::new(array); - - // Introduce more entities - let key_indices = UInt32Array::from(vec![0, 1, 2, 1, 0, 1]); - let key_capacity = 3; - let result = - LastListEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); - let result = result.as_list(); - - let mut builder = ListBuilder::new(Int64Builder::new()); - 
builder.append_value([Some(10), Some(11)]); - builder.append(true); - builder.append_value([Some(13), None]); - builder.append(true); - builder.append_value([Some(10), Some(11)]); - builder.append_value([Some(14)]); - let expected = builder.finish(); - - assert_eq!(&expected, result); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/map.rs b/crates/sparrow-instructions/src/evaluators/aggregation/map.rs deleted file mode 100644 index 3d14b8a54..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/map.rs +++ /dev/null @@ -1,7 +0,0 @@ -//! Map aggregation evaluators. - -mod first_map_evaluator; -mod last_map_evaluator; - -pub use first_map_evaluator::*; -pub use last_map_evaluator::*; diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/map/first_map_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/map/first_map_evaluator.rs deleted file mode 100644 index fbbb12f30..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/map/first_map_evaluator.rs +++ /dev/null @@ -1,248 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use crate::{ - AggregationArgs, Evaluator, EvaluatorFactory, MapAccumToken, RuntimeInfo, StateToken, - StaticInfo, -}; -use arrow::array::{ - as_map_array, new_empty_array, Array, ArrayRef, AsArray, PrimitiveArray, UInt32Array, -}; - -/// Evaluator for the `First` instruction on maps -pub struct FirstMapEvaluator { - args: AggregationArgs, - token: MapAccumToken, -} - -impl Evaluator for FirstMapEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::NoWindow { input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ); - - result - } - AggregationArgs::Since { ticks: _, input: _ } => { - unimplemented!("windowed aggregation over 
maps") - } - AggregationArgs::Sliding { .. } => { - panic!("expected non-windowed or since-windowed aggregation, saw sliding.") - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl EvaluatorFactory for FirstMapEvaluator { - fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { - let args = AggregationArgs::from_input(info.args)?; - match args { - AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { - let map_type = info.result_type; - let accum = new_empty_array(map_type).as_map().to_owned(); - let token = MapAccumToken::new(Arc::new(accum)); - Ok(Box::new(Self { token, args })) - } - AggregationArgs::Sliding { .. } => { - unimplemented!("sliding window aggregation over maps unsupported") - } - } - } -} - -impl FirstMapEvaluator { - /// Resizes the accumulator to the new size. - fn ensure_entity_capacity(token: &mut MapAccumToken, len: usize) -> anyhow::Result<()> { - token.resize(len) - } - - /// Returns the existing value for an entity if it exists, or a new value from the - /// input if it exists, or null if neither. - /// - /// Takes advantage of the `take` and `concat` kernels to avoid having to type the - /// evaluator, keeping everything as ArrayRefs. - /// - /// The output is taken from the concatenated batch of the old state and the new input. - /// If the old state's value is null, then the take index for that entity is the length - /// of the old state plus the current index (i.e. the index into the new input). - /// If not, then we keep the take index as the old state's index. 
- fn aggregate( - token: &mut MapAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - ) -> anyhow::Result { - Self::ensure_entity_capacity(token, key_capacity)?; - let map_input = as_map_array(input); - - let mut take_new_state: Vec = (0..token.accum.len() as u32).collect(); - let mut take_output_builder = UInt32Array::builder(input.len()); - for input_index in 0..map_input.len() { - let entity_index = key_indices.value(input_index); - if token.value_is_null(entity_index) && map_input.is_valid(input_index) { - // If the `take_new_state[entity_index]` is greater than the length, that - // means it has been set already, so we should not overwrite it. - let not_taken = take_new_state[entity_index as usize] < take_new_state.len() as u32; - if not_taken { - take_new_state[entity_index as usize] = - (input_index + take_new_state.len()) as u32; - } - }; - - take_output_builder.append_value(take_new_state[entity_index as usize]) - } - - // Gather the output, using the previous state and the new input - let output = - sparrow_arrow::concat_take(&token.accum, input, &take_output_builder.finish())?; - - // Update the state token with the new state - let take_new_state = PrimitiveArray::from_iter_values(take_new_state); - let new_state = sparrow_arrow::concat_take(&token.accum, input, &take_new_state)?; - token.set_state(new_state); - - Ok(output) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow::array::{AsArray, Int64Builder, MapBuilder}; - use arrow_schema::{DataType, Field, Fields}; - use std::sync::Arc; - - fn default_token() -> MapAccumToken { - let k = Field::new("keys", DataType::Int64, false); - let v = Field::new("values", DataType::Int64, true); - let fields = Fields::from(vec![k, v]); - let s = Arc::new(Field::new("entries", DataType::Struct(fields), false)); - let map = DataType::Map(s, false); - let accum = new_empty_array(&map); - MapAccumToken { accum } - } - - #[test] - fn test_first_map_multiple_batches() { - let 
mut token = default_token(); - let key_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let key_capacity = 1; - - // Batch 1 - let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); - builder.keys().append_value(1); - builder.values().append_value(1); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(2); - builder.keys().append_value(2); - builder.values().append_value(4); - builder.append(true).unwrap(); - - builder.append(true).unwrap(); - - builder.keys().append_value(2); - builder.values().append_value(99); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(10); - builder.keys().append_value(3); - builder.values().append_value(7); - builder.append(true).unwrap(); - let array = builder.finish(); - - let input: ArrayRef = Arc::new(array); - let result = - FirstMapEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); - let result = result.as_map(); - - let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); - for _ in 0..5 { - builder.keys().append_value(1); - builder.values().append_value(1); - builder.append(true).unwrap(); - } - let expected = builder.finish(); - - assert_eq!(&expected, result); - - // Batch 2 - let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); - builder.keys().append_value(1); - builder.values().append_value(1); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(2); - builder.keys().append_value(2); - builder.values().append_value(4); - builder.append(true).unwrap(); - - builder.append(true).unwrap(); - - builder.keys().append_value(2); - builder.values().append_value(99); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(10); - builder.keys().append_value(3); - builder.values().append_value(7); - builder.append(true).unwrap(); - - 
let array = builder.finish(); - let input: ArrayRef = Arc::new(array); - - // Introduce second entity key - let key_indices = UInt32Array::from(vec![0, 1, 0, 1, 0]); - let key_capacity = 2; - let result = - FirstMapEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); - let result = result.as_map(); - - let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); - builder.keys().append_value(1); - builder.values().append_value(1); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(2); - builder.keys().append_value(2); - builder.values().append_value(4); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(1); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(2); - builder.keys().append_value(2); - builder.values().append_value(4); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(1); - builder.append(true).unwrap(); - let expected = builder.finish(); - - assert_eq!(&expected, result); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/map/last_map_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/map/last_map_evaluator.rs deleted file mode 100644 index f2e8882e7..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/map/last_map_evaluator.rs +++ /dev/null @@ -1,236 +0,0 @@ -use std::sync::Arc; - -use arrow::array::{ - as_map_array, new_empty_array, Array, ArrayRef, AsArray, PrimitiveArray, UInt32Array, -}; - -use crate::ValueRef; - -use crate::{ - AggregationArgs, Evaluator, EvaluatorFactory, MapAccumToken, RuntimeInfo, StateToken, - StaticInfo, -}; - -/// Evaluator for the `Last` instruction on maps -pub struct LastMapEvaluator { - args: AggregationArgs, - token: MapAccumToken, -} - -impl Evaluator for LastMapEvaluator { - fn evaluate(&mut self, info: &dyn 
RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::NoWindow { input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ); - - result - } - AggregationArgs::Since { ticks: _, input: _ } => { - unimplemented!("windowed aggregation over maps") - } - AggregationArgs::Sliding { .. } => { - panic!("expected non-windowed or since-windowed aggregation, saw sliding.") - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl EvaluatorFactory for LastMapEvaluator { - fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { - let args = AggregationArgs::from_input(info.args)?; - match args { - AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { - let map_type = info.result_type; - let accum = new_empty_array(map_type).as_map().to_owned(); - let token = MapAccumToken::new(Arc::new(accum)); - Ok(Box::new(Self { token, args })) - } - AggregationArgs::Sliding { .. } => { - unimplemented!("sliding window aggregation over maps unsupported") - } - } - } -} - -impl LastMapEvaluator { - /// Resizes the accumulator to the new size. - fn ensure_entity_capacity(token: &mut MapAccumToken, len: usize) -> anyhow::Result<()> { - token.resize(len) - } - - /// Returns the existing value for an entity if it exists, or a new value from the - /// input if it exists, or null if neither. - /// - /// Takes advantage of the `take` and `concat` kernels to avoid having to type the - /// evaluator, keeping everything as ArrayRefs. - /// - /// The output is taken from the concatenated batch of the old state and the new input. - /// The take index for that entity is the length of the old state plus the current index - /// (i.e. 
the index into the new input), if the new input is valid. Else, it stays the - /// index into the old state. - fn aggregate( - token: &mut MapAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - ) -> anyhow::Result { - Self::ensure_entity_capacity(token, key_capacity)?; - let map_input = as_map_array(input); - - let mut take_new_state: Vec = (0..token.accum.len() as u32).collect(); - let mut take_output_builder = UInt32Array::builder(input.len()); - for input_index in 0..map_input.len() { - let entity_index = key_indices.value(input_index); - if map_input.is_valid(input_index) { - take_new_state[entity_index as usize] = (input_index + take_new_state.len()) as u32; - } - take_output_builder.append_value(take_new_state[entity_index as usize]) - } - - // Gather the output, using the previous state and the new input - let output = - sparrow_arrow::concat_take(&token.accum, input, &take_output_builder.finish())?; - - // Update the state token with the new state - let take_new_state = PrimitiveArray::from_iter_values(take_new_state); - let new_state = sparrow_arrow::concat_take(&token.accum, input, &take_new_state)?; - token.set_state(new_state); - - Ok(output) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow::array::{AsArray, Int64Builder, MapBuilder}; - use arrow_schema::{DataType, Field, Fields}; - use std::sync::Arc; - - fn default_token() -> MapAccumToken { - let k = Field::new("keys", DataType::Int64, false); - let v = Field::new("values", DataType::Int64, true); - let fields = Fields::from(vec![k, v]); - let s = Arc::new(Field::new("entries", DataType::Struct(fields), false)); - let map = DataType::Map(s, false); - let accum = new_empty_array(&map); - MapAccumToken { accum } - } - - #[test] - fn test_first_map_multiple_batches() { - let mut token = default_token(); - let key_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let key_capacity = 1; - - // Batch 1 - let mut builder = MapBuilder::new(None, 
Int64Builder::new(), Int64Builder::new()); - builder.keys().append_value(1); - builder.values().append_value(1); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(2); - builder.keys().append_value(2); - builder.values().append_value(4); - builder.append(true).unwrap(); - - builder.append(true).unwrap(); - - builder.keys().append_value(2); - builder.values().append_value(99); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(10); - builder.keys().append_value(3); - builder.values().append_value(7); - builder.append(true).unwrap(); - let array = builder.finish(); - - // Last should pull latest - let expected = array.clone(); - - let input: ArrayRef = Arc::new(array); - let result = - LastMapEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); - let result = result.as_map(); - - assert_eq!(&expected, result); - - // Batch 2 - let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); - builder.append(false).unwrap(); - - builder.append(false).unwrap(); - - builder.append(false).unwrap(); - - builder.keys().append_value(2); - builder.values().append_value(99); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(10); - builder.keys().append_value(3); - builder.values().append_value(7); - builder.append(true).unwrap(); - - let array = builder.finish(); - let input: ArrayRef = Arc::new(array); - - // Introduce second entity key - let key_indices = UInt32Array::from(vec![0, 1, 0, 1, 0]); - let key_capacity = 2; - let result = - LastMapEvaluator::aggregate(&mut token, key_capacity, &key_indices, &input).unwrap(); - let result = result.as_map(); - - let mut builder = MapBuilder::new(None, Int64Builder::new(), Int64Builder::new()); - // Uses last non-null value - builder.keys().append_value(1); - builder.values().append_value(10); - builder.keys().append_value(3); - 
builder.values().append_value(7); - builder.append(true).unwrap(); - - // First value for this entity is null, so result is null - builder.append(false).unwrap(); - - // Uses last non-null value - builder.keys().append_value(1); - builder.values().append_value(10); - builder.keys().append_value(3); - builder.values().append_value(7); - builder.append(true).unwrap(); - - builder.keys().append_value(2); - builder.values().append_value(99); - builder.append(true).unwrap(); - - builder.keys().append_value(1); - builder.values().append_value(10); - builder.keys().append_value(3); - builder.values().append_value(7); - builder.append(true).unwrap(); - - let expected = builder.finish(); - - assert_eq!(&expected, result); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/primitive/arrow_agg_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/primitive/arrow_agg_evaluator.rs index 4779a7428..7348ded28 100644 --- a/crates/sparrow-instructions/src/evaluators/aggregation/primitive/arrow_agg_evaluator.rs +++ b/crates/sparrow-instructions/src/evaluators/aggregation/primitive/arrow_agg_evaluator.rs @@ -375,7 +375,7 @@ mod tests { use arrow::datatypes::{Float64Type, Int64Type}; use super::*; - use crate::{FirstPrimitive, LastPrimitive, Max, Mean, Sum}; + use crate::{Max, Mean, Sum}; #[test] fn test_sum_f64() { @@ -652,103 +652,4 @@ mod tests { ]) ); } - - #[test] - fn test_last_f64() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); - let input: ArrayRef = Arc::new(Float64Array::from(vec![ - Some(1.0), - Some(2.0), - None, - None, - Some(3.0), - ])); - let mut accum = Vec::new(); - - let output = ArrowAggEvaluator::>::aggregate( - &mut accum, - 3, - &entity_indices, - &input, - ) - .unwrap(); - - let output = downcast_primitive_array::(output.as_ref()).unwrap(); - assert_eq!( - output, - &Float64Array::from(vec![Some(1.0), Some(2.0), None, Some(2.0), Some(3.0)]) - ); - - let entity_indices = UInt32Array::from(vec![0, 1, 2, 
1, 1]); - let input: ArrayRef = Arc::new(Float64Array::from(vec![ - None, - Some(4.0), - Some(5.0), - None, - None, - ])); - let output = ArrowAggEvaluator::>::aggregate( - &mut accum, - 3, - &entity_indices, - &input, - ) - .unwrap(); - - let output = downcast_primitive_array::(output.as_ref()).unwrap(); - assert_eq!( - output, - &Float64Array::from(vec![Some(1.0), Some(4.0), Some(5.0), Some(4.0), Some(4.0)]) - ); - } - - #[test] - fn test_first_f64() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); - let input: ArrayRef = Arc::new(Float64Array::from(vec![ - Some(1.0), - Some(2.0), - None, - None, - Some(3.0), - ])); - let mut accum = Vec::new(); - - let output = ArrowAggEvaluator::>::aggregate( - &mut accum, - 3, - &entity_indices, - &input, - ) - .unwrap(); - - let output = downcast_primitive_array::(output.as_ref()).unwrap(); - - assert_eq!( - output, - &Float64Array::from(vec![Some(1.0), Some(2.0), None, Some(2.0), Some(2.0)]) - ); - - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); - let input: ArrayRef = Arc::new(Float64Array::from(vec![ - None, - Some(4.0), - Some(5.0), - None, - None, - ])); - let output = ArrowAggEvaluator::>::aggregate( - &mut accum, - 3, - &entity_indices, - &input, - ) - .unwrap(); - - let output = downcast_primitive_array::(output.as_ref()).unwrap(); - assert_eq!( - output, - &Float64Array::from(vec![Some(1.0), Some(2.0), Some(5.0), Some(2.0), Some(2.0)]) - ); - } } diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/primitive/two_stacks_arrow_agg_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/primitive/two_stacks_arrow_agg_evaluator.rs index ab600a80c..57dbd7aab 100644 --- a/crates/sparrow-instructions/src/evaluators/aggregation/primitive/two_stacks_arrow_agg_evaluator.rs +++ b/crates/sparrow-instructions/src/evaluators/aggregation/primitive/two_stacks_arrow_agg_evaluator.rs @@ -261,7 +261,7 @@ mod tests { use arrow::datatypes::{Float64Type, Int64Type}; use super::*; 
- use crate::{FirstPrimitive, LastPrimitive, Max, Mean, Min, Sum}; + use crate::{Max, Mean, Min, Sum}; #[test] fn test_sliding_sum_f64() { @@ -499,58 +499,4 @@ mod tests { let output = downcast_primitive_array::(output.as_ref()).unwrap(); assert_eq!(output, &Float64Array::from(vec![1.0, 1.5, 2.5, 3.0, 4.0])); } - - #[test] - fn test_sliding_first_f64() { - let entity_indices = UInt32Array::from(vec![0; 5]); - let input: ArrayRef = Arc::new(Float64Array::from(vec![1f64, 2.0, 3.0, 4.0, 5.0])); - let sliding = BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - Some(true), - ]); - let mut accum = Vec::new(); - - let output = TwoStacksArrowAggEvaluator::>::aggregate( - &mut accum, - 1, - &entity_indices, - &input, - 2, - &sliding, - ) - .unwrap(); - - let output = downcast_primitive_array::(output.as_ref()).unwrap(); - assert_eq!(output, &Float64Array::from(vec![1.0, 1.0, 2.0, 2.0, 3.0])); - } - - #[test] - fn test_sliding_last_f64() { - let entity_indices = UInt32Array::from(vec![0; 5]); - let input: ArrayRef = Arc::new(Float64Array::from(vec![1f64, 2.0, 3.0, 4.0, 5.0])); - let sliding = BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - Some(true), - ]); - let mut accum = Vec::new(); - - let output = TwoStacksArrowAggEvaluator::>::aggregate( - &mut accum, - 1, - &entity_indices, - &input, - 2, - &sliding, - ) - .unwrap(); - - let output = downcast_primitive_array::(output.as_ref()).unwrap(); - assert_eq!(output, &Float64Array::from(vec![1.0, 2.0, 3.0, 4.0, 5.0])); - } } diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/string.rs b/crates/sparrow-instructions/src/evaluators/aggregation/string.rs deleted file mode 100644 index 5e5420f57..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/string.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! String aggregation evaluators. 
- -mod first_string_evaluator; -mod last_string_evaluator; - -mod two_stacks_first_string_evaluator; -mod two_stacks_last_string_evaluator; - -pub use first_string_evaluator::*; -pub use last_string_evaluator::*; diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/string/first_string_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/string/first_string_evaluator.rs deleted file mode 100644 index 376475308..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/string/first_string_evaluator.rs +++ /dev/null @@ -1,423 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use arrow::array::{Array, ArrayRef, BooleanArray, StringArray, UInt32Array}; -use itertools::izip; -use sparrow_arrow::downcast::downcast_string_array; - -use super::two_stacks_first_string_evaluator::TwoStacksFirstStringEvaluator; -use crate::{ - AggregationArgs, Evaluator, EvaluatorFactory, RuntimeInfo, StateToken, StaticInfo, - StringAccumToken, TwoStacksStringAccumToken, -}; - -/// Evaluator for the `First` instruction on strings. -pub struct FirstStringEvaluator { - args: AggregationArgs, - token: StringAccumToken, -} - -impl Evaluator for FirstStringEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::NoWindow { input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ); - - result - } - AggregationArgs::Since { ticks, input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let ticks = info.value(ticks)?.boolean_array()?; - let result = Self::aggregate_since( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ticks.as_ref(), - ); - - result - } - AggregationArgs::Sliding { .. 
} => { - unreachable!("Expected Non-windowed or Since windowed aggregation, saw Sliding.") - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl EvaluatorFactory for FirstStringEvaluator { - fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { - let args = AggregationArgs::from_input(info.args)?; - match args { - AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { - let token = StringAccumToken::default(); - Ok(Box::new(Self { token, args })) - } - AggregationArgs::Sliding { .. } => { - let token = TwoStacksStringAccumToken::new(); - Ok(Box::new(TwoStacksFirstStringEvaluator { token, args })) - } - } - } -} - -impl FirstStringEvaluator { - fn ensure_entity_capacity(token: &mut StringAccumToken, entity_id_len: usize) { - token.resize(entity_id_len); - } - - /// Updates the non-windowed accumulator based on the given flags. - /// - /// Implements a single row of the logic so that we can easily reuse it. - /// We choose to inline this so that it can be specialized in cases where - /// the valid bits are always true. - #[inline] - fn update_accum( - token: &mut StringAccumToken, - entity_index: u32, - input_is_valid: bool, - input: &str, - ) -> anyhow::Result> { - let value_to_emit = match token.get_value(entity_index)? { - Some(v) => Some(v), - None => { - if input_is_valid { - token.put_value(entity_index, Some(input.to_string()))?; - Some(input.to_string()) - } else { - None - } - } - }; - - Ok(value_to_emit) - } - - /// Updates the since-windowed accumulator based on the given flags. - /// - /// Accumulator behavior is to update -> emit -> reset, resulting in - /// exclusive start bounds and inclusive end bounds. - /// - /// Implements a single row of the logic so that we can easily reuse it. - /// We choose to inline this so that it can be specialized in cases where - /// the valid bits are always true. 
- #[inline] - fn update_since_accum( - token: &mut StringAccumToken, - entity_index: u32, - input_is_valid: bool, - since_is_valid: bool, - input: &str, - since_bool: bool, - ) -> anyhow::Result> { - let reset_window = since_is_valid && since_bool; - let value_to_emit = match token.get_value(entity_index)? { - Some(v) => Some(v), - None => { - // The value is not present. This means the window is ready to accept - // a first value. - if input_is_valid && reset_window { - // Here we know we're going to reset the window, so don't bother attempting to - // put a new value, since it'll be set to `None` afterwards. - Some(input.to_string()) - } else if input_is_valid { - // Use the valid new input - token.put_value(entity_index, Some(input.to_string()))?; - Some(input.to_string()) - } else { - // No new input is present, so the result is null - None - } - } - }; - - if reset_window { - // Reset the value for this entity - token.put_value(entity_index, None)?; - }; - - Ok(value_to_emit) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. - fn aggregate( - token: &mut StringAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &StringArray = downcast_string_array(input.as_ref())?; - - // Make sure the internal buffers are large enough for the accumulators we may - // want to store. 
- Self::ensure_entity_capacity(token, key_capacity); - - let result: StringArray = if let Some(input_valid_bits) = input.nulls() { - izip!(key_indices.values(), input_valid_bits, 0..) - .map(|(entity_index, input_is_valid, input_index)| { - Self::update_accum( - token, - *entity_index, - input_is_valid, - input.value(input_index), - ) - }) - .collect::>()? - } else { - izip!(key_indices.values(), 0..) - .map(|(entity_index, input_index)| { - Self::update_accum(token, *entity_index, true, input.value(input_index)) - }) - .collect::>()? - }; - - Ok(Arc::new(result)) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Window Behavior - /// This aggregation uses the `since` window behavior, which takes a single - /// predicate. If the predicate evaluates to true, the accumulated value is - /// reset. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. - fn aggregate_since( - token: &mut StringAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - window_since: &BooleanArray, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &StringArray = downcast_string_array(input.as_ref())?; - - // Make sure the internal buffers are large enough for the accumulators we may - // want to store. 
- Self::ensure_entity_capacity(token, key_capacity); - - let result: StringArray = match (input.nulls(), window_since.nulls()) { - (None, None) => izip!(key_indices.values(), 0.., window_since.values().iter()) - .map(|(entity_index, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - true, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), None) => izip!( - key_indices.values(), - input_valid_bits, - 0.., - window_since.values().iter() - ) - .map(|(entity_index, input_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - input_is_valid, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (None, Some(window_valid_bits)) => izip!( - key_indices.values(), - window_valid_bits, - 0.., - window_since.values().iter() - ) - .map(|(entity_index, since_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - true, - since_is_valid, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), Some(window_valid_bits)) => izip!( - key_indices.values(), - input_valid_bits, - window_valid_bits, - 0.., - window_since.values().iter() - ) - .map( - |(entity_index, input_is_valid, since_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - input_is_valid, - since_is_valid, - input.value(input_index), - since_bool, - ) - }, - ) - .collect::>()?, - }; - - Ok(Arc::new(result)) - } -} - -#[cfg(test)] -mod tests { - - use super::*; - - /// Size of chunks used during aggregation. 64 aligns with the size of - /// chunks in arrow's null-bit vector. 
- const CHUNK_SIZE: usize = 64; - - #[test] - fn test_string_first_with_multiple_chunks() { - let entity_indices = UInt32Array::from(vec![0; CHUNK_SIZE + 1]); - let input: Vec = (0..CHUNK_SIZE + 1).map(|i| i.to_string()).collect(); - let input: Vec<&str> = input.iter().map(|i| i.as_str()).collect(); - let input: ArrayRef = Arc::new(StringArray::from(input)); - let mut token = StringAccumToken::default(); - let output = - FirstStringEvaluator::aggregate(&mut token, 1, &entity_indices, &input).unwrap(); - - let expected = StringArray::from(vec!["0"; CHUNK_SIZE + 1]); - assert_eq!( - downcast_string_array::(output.as_ref()).unwrap(), - &expected - ); - } - - #[test] - fn test_string_first_with_no_null() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1, 0]); - let input: ArrayRef = Arc::new(StringArray::from(vec![ - Some("phone"), - Some("hello"), - Some("world"), - Some("monday"), - Some("dog"), - Some("apple"), - ])); - let mut token = StringAccumToken::default(); - let output = - FirstStringEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); - - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![ - Some("phone"), - Some("hello"), - Some("world"), - Some("hello"), - Some("hello"), - Some("phone") - ]) - ); - } - - #[test] - fn test_string_first_with_null() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1, 0]); - - let input: ArrayRef = Arc::new(StringArray::from(vec![ - None, - Some("okay"), - None, - None, - Some("dog"), - Some("apple"), - ])); - let mut token = StringAccumToken::default(); - let output = - FirstStringEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); - - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![ - None, - Some("okay"), - None, - Some("okay"), - Some("okay"), - Some("apple") - ]) - ); - - // And another round (to make sure values carry over) - let entity_indices = UInt32Array::from(vec![0, 1, 1, 2, 3, 
0]); - let input: ArrayRef = Arc::new(StringArray::from(vec![ - Some("cat"), - Some("light"), - None, - Some("dark"), - None, - Some("outside"), - ])); - let output = - FirstStringEvaluator::aggregate(&mut token, 4, &entity_indices, &input).unwrap(); - - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![ - Some("apple"), - Some("okay"), - Some("okay"), - Some("dark"), - None, - Some("apple") - ]) - ); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/string/last_string_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/string/last_string_evaluator.rs deleted file mode 100644 index 189a108e9..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/string/last_string_evaluator.rs +++ /dev/null @@ -1,467 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use arrow::array::{Array, ArrayRef, BooleanArray, StringArray, UInt32Array}; -use itertools::izip; -use sparrow_arrow::downcast::downcast_string_array; - -use super::two_stacks_last_string_evaluator::TwoStacksLastStringEvaluator; -use crate::{ - AggregationArgs, Evaluator, EvaluatorFactory, RuntimeInfo, StateToken, StaticInfo, - StringAccumToken, TwoStacksStringAccumToken, -}; - -/// Evaluator for the `last` instruction on strings. 
-pub struct LastStringEvaluator { - args: AggregationArgs, - token: StringAccumToken, -} - -impl Evaluator for LastStringEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::NoWindow { input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ); - - result - } - AggregationArgs::Since { ticks, input } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let ticks = info.value(ticks)?.boolean_array()?; - let result = Self::aggregate_since( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - ticks.as_ref(), - ); - - result - } - AggregationArgs::Sliding { .. } => { - unreachable!("Expected Non-windowed or Since windowed aggregation, saw Sliding.") - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl EvaluatorFactory for LastStringEvaluator { - fn try_new(info: StaticInfo<'_>) -> anyhow::Result> { - let args = AggregationArgs::from_input(info.args)?; - match args { - AggregationArgs::NoWindow { .. } | AggregationArgs::Since { .. } => { - let token = StringAccumToken::default(); - Ok(Box::new(Self { token, args })) - } - AggregationArgs::Sliding { .. } => { - let token = TwoStacksStringAccumToken::new(); - Ok(Box::new(TwoStacksLastStringEvaluator { token, args })) - } - } - } -} - -impl LastStringEvaluator { - fn ensure_entity_capacity(token: &mut StringAccumToken, entity_id_len: usize) { - token.resize(entity_id_len); - } - - /// Updates the non-windowed accumulator based on the given flags. - /// - /// Implements a single row of the logic so that we can easily reuse it. 
- /// We choose to inline this so that it can be specialized in cases where - /// the valid bits are always true. - #[inline] - fn update_accum( - token: &mut StringAccumToken, - entity_index: u32, - input_is_valid: bool, - input: &str, - ) -> anyhow::Result> { - let value_to_emit = if input_is_valid { - token.put_value(entity_index, Some(input.to_string()))?; - Some(input.to_string()) - } else { - token.get_value(entity_index)? - }; - - Ok(value_to_emit) - } - - /// Updates the since-windowed accumulator based on the given flags. - /// - /// Accumulator behavior is to update -> emit -> reset, resulting in - /// exclusive start bounds and inclusive end bounds. - /// - /// Implements a single row of the logic so that we can easily reuse it. - /// We choose to inline this so that it can be specialized in cases where - /// the valid bits are always true. - #[inline] - fn update_since_accum( - token: &mut StringAccumToken, - entity_index: u32, - input_is_valid: bool, - since_is_valid: bool, - input: &str, - since_bool: bool, - ) -> anyhow::Result> { - let reset_window = since_is_valid && since_bool; - let value_to_emit = if input_is_valid && reset_window { - // Here we know we're going to reset the window, so don't bother attempting to - // put a new value, since it'll be set to `None` afterwards. - Some(input.to_string()) - } else if input_is_valid { - // Input is valid and no reset, so put the new value. - token.put_value(entity_index, Some(input.to_string()))?; - Some(input.to_string()) - } else { - // Input is not valid, so grab the current latest value. - token.get_value(entity_index)? - }; - - if reset_window { - // Reset the value for this entity - token.put_value(entity_index, None)?; - }; - - Ok(value_to_emit) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. 
- /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. - fn aggregate( - token: &mut StringAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &StringArray = downcast_string_array(input.as_ref())?; - - // Make sure the internal buffers are large enough for the accumulators we may - // want to store. - Self::ensure_entity_capacity(token, key_capacity); - - let result: StringArray = if let Some(input_valid_bits) = input.nulls() { - izip!(key_indices.values(), input_valid_bits, 0..) - .map(|(entity_index, input_is_valid, input_index)| { - Self::update_accum( - token, - *entity_index, - input_is_valid, - input.value(input_index), - ) - }) - .collect::>()? - } else { - izip!(key_indices.values(), 0..) - .map(|(entity_index, input_index)| { - Self::update_accum(token, *entity_index, true, input.value(input_index)) - }) - .collect::>()? - }; - - Ok(Arc::new(result)) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Window Behavior - /// This aggregation uses the `since` window behavior, which takes a single - /// predicate. If the predicate evaluates to true, the accumulated value is - /// reset. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. 
- fn aggregate_since( - token: &mut StringAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - window_since: &BooleanArray, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &StringArray = downcast_string_array(input.as_ref())?; - - // Make sure the internal buffers are large enough for the accumulators we may - // want to store. - Self::ensure_entity_capacity(token, key_capacity); - - let result: StringArray = match (input.nulls(), window_since.nulls()) { - (None, None) => izip!(key_indices.values(), 0.., window_since.values().iter()) - .map(|(entity_index, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - true, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), None) => izip!( - key_indices.values(), - input_valid_bits, - 0.., - window_since.values().iter() - ) - .map(|(entity_index, input_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - input_is_valid, - true, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (None, Some(window_valid_bits)) => izip!( - key_indices.values(), - window_valid_bits, - 0.., - window_since.values().iter() - ) - .map(|(entity_index, since_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - true, - since_is_valid, - input.value(input_index), - since_bool, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), Some(window_valid_bits)) => izip!( - key_indices.values(), - input_valid_bits, - window_valid_bits, - 0.., - window_since.values().iter() - ) - .map( - |(entity_index, input_is_valid, since_is_valid, input_index, since_bool)| { - Self::update_since_accum( - token, - *entity_index, - input_is_valid, - since_is_valid, - input.value(input_index), - since_bool, - ) - }, - ) - .collect::>()?, - }; - - Ok(Arc::new(result)) - } -} - -#[cfg(test)] -mod tests { - - use 
arrow::array::{ArrayRef, BooleanArray}; - - use super::*; - - /// Size of chunks used during aggregation. 64 aligns with the size of - /// chunks in arrow's null-bit vector. - const CHUNK_SIZE: usize = 64; - - #[test] - fn test_string_last_with_multiple_chunks() { - let entity_indices = UInt32Array::from(vec![0; CHUNK_SIZE + 1]); - let input: Vec = (0..CHUNK_SIZE + 1).map(|i| i.to_string()).collect(); - let input: Vec<&str> = input.iter().map(|i| i.as_str()).collect(); - let input: ArrayRef = Arc::new(StringArray::from(input)); - - let mut token = StringAccumToken::default(); - let output = - LastStringEvaluator::aggregate(&mut token, 1, &entity_indices, &input).unwrap(); - - assert_eq!( - downcast_string_array::(output.as_ref()).unwrap(), - downcast_string_array::(input.as_ref()).unwrap() - ); - } - - #[test] - fn test_string_last_with_no_null() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); - let input: ArrayRef = Arc::new(StringArray::from(vec![ - Some("phone"), - Some("hello"), - Some("world"), - Some("monday"), - Some("dog"), - ])); - let mut token = StringAccumToken::default(); - let output = - LastStringEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); - - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![ - Some("phone"), - Some("hello"), - Some("world"), - Some("monday"), - Some("dog") - ]) - ); - } - - #[test] - fn test_string_last_with_null() { - let entity_indices = UInt32Array::from(vec![0, 1, 2, 1, 1]); - let input: ArrayRef = Arc::new(StringArray::from(vec![ - Some("phone"), - Some("hello"), - None, - None, - Some("dog"), - ])); - let mut token = StringAccumToken::default(); - let output = - LastStringEvaluator::aggregate(&mut token, 3, &entity_indices, &input).unwrap(); - - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![ - Some("phone"), - Some("hello"), - None, - Some("hello"), - Some("dog") - ]) - ); - - // And another round (to 
make sure values carry over) - let entity_indices = UInt32Array::from(vec![0, 1, 1, 2, 3]); - let input: ArrayRef = Arc::new(StringArray::from(vec![ - None, - None, - Some("second"), - None, - None, - ])); - let output = - LastStringEvaluator::aggregate(&mut token, 4, &entity_indices, &input).unwrap(); - - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![Some("phone"), Some("dog"), Some("second"), None, None,]) - ); - } - - #[test] - fn test_string_last_since_with_null() { - let entity_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let input: ArrayRef = Arc::new(StringArray::from(vec![ - Some("phone"), - Some("hello"), - None, - None, - Some("dog"), - ])); - - let since = BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - Some(false), - ]); - - let mut token = StringAccumToken::default(); - let output = - LastStringEvaluator::aggregate_since(&mut token, 3, &entity_indices, &input, &since) - .unwrap(); - - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![ - Some("phone"), - Some("hello"), - Some("hello"), - Some("hello"), - Some("dog") - ]) - ); - - // And another round (to make sure values carry over) - let entity_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let input: ArrayRef = Arc::new(StringArray::from(vec![ - None, - None, - Some("second"), - None, - None, - ])); - let since = BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - Some(true), - Some(false), - ]); - let output = - LastStringEvaluator::aggregate_since(&mut token, 4, &entity_indices, &input, &since) - .unwrap(); - - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![ - Some("dog"), - Some("dog"), - Some("second"), - Some("second"), - None, - ]) - ); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/string/two_stacks_first_string_evaluator.rs 
b/crates/sparrow-instructions/src/evaluators/aggregation/string/two_stacks_first_string_evaluator.rs deleted file mode 100644 index ab44b175d..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/string/two_stacks_first_string_evaluator.rs +++ /dev/null @@ -1,269 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use anyhow::anyhow; -use arrow::array::{Array, ArrayRef, BooleanArray, StringArray, UInt32Array}; -use arrow::datatypes::Int64Type; -use itertools::izip; -use sparrow_arrow::downcast::downcast_string_array; - -use crate::{ - AggregationArgs, Evaluator, FirstString, RuntimeInfo, StateToken, TwoStacks, - TwoStacksStringAccumToken, -}; - -/// Evaluator for the `First` instruction on strings. -pub(crate) struct TwoStacksFirstStringEvaluator { - pub args: AggregationArgs, - pub token: TwoStacksStringAccumToken, -} - -impl Evaluator for TwoStacksFirstStringEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::Sliding { - input, - ticks, - duration, - } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let ticks = info.value(ticks)?.boolean_array()?; - let duration = info - .value(duration)? - .try_primitive_literal::()? - .ok_or_else(|| anyhow!("Expected non-null literal duration"))?; - if duration <= 0 { - anyhow::bail!( - "Expected positive duration for sliding window, saw {:?}", - duration - ); - } - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - duration, - ticks.as_ref(), - ); - - result - } - AggregationArgs::Since { .. } | AggregationArgs::NoWindow { .. } => { - unreachable!( - "Expected sliding-windowed aggregation, saw non-windowed or since windowed." 
- ) - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl TwoStacksFirstStringEvaluator { - fn ensure_entity_capacity( - token: &mut TwoStacksStringAccumToken, - entity_capacity: usize, - window_parts: i64, - ) { - token.resize(entity_capacity, window_parts); - } - - /// Updates the windowed accumulator based on the given flags. - /// - /// Accumulator behavior is to update -> emit -> reset, resulting in - /// exclusive start bounds and inclusive end bounds. - #[inline] - #[allow(clippy::too_many_arguments)] - fn update_two_stacks_accum( - token: &mut TwoStacksStringAccumToken, - entity_index: u32, - input_is_valid: bool, - sliding_is_valid: bool, - input: &str, - sliding: bool, - initial_windows: i64, - ) -> anyhow::Result> { - let evict_window = sliding_is_valid && sliding; - - let mut accum = match token.get_value(entity_index)? { - Some(accum) => accum, - None => TwoStacks::new(initial_windows), - }; - - if input_is_valid { - accum.add_input(&input.to_string()); - }; - - let value_to_emit = accum.accum_value(); - - if evict_window { - accum.evict(); - } - - token.put_value(entity_index, accum)?; - Ok(value_to_emit) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Window Behavior - /// This aggregation uses the `sliding` window behavior. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. 
- fn aggregate( - token: &mut TwoStacksStringAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - sliding_duration: i64, - sliding_window: &BooleanArray, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &StringArray = downcast_string_array(input.as_ref())?; - - Self::ensure_entity_capacity(token, key_capacity, sliding_duration); - - let result: StringArray = match (input.nulls(), sliding_window.nulls()) { - (None, None) => izip!(key_indices.values(), 0.., sliding_window.values().iter()) - .map(|(entity_index, input_index, since_bool)| { - Self::update_two_stacks_accum( - token, - *entity_index, - true, - true, - input.value(input_index), - since_bool, - sliding_duration, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), None) => izip!( - key_indices.values(), - input_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map(|(entity_index, input_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - token, - *entity_index, - input_is_valid, - true, - input.value(input_index), - since_bool, - sliding_duration, - ) - }) - .collect::>()?, - - (None, Some(window_valid_bits)) => izip!( - key_indices.values(), - window_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map(|(entity_index, since_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - token, - *entity_index, - true, - since_is_valid, - input.value(input_index), - since_bool, - sliding_duration, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), Some(window_valid_bits)) => izip!( - key_indices.values(), - input_valid_bits, - window_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map( - |(entity_index, input_is_valid, since_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - token, - *entity_index, - input_is_valid, - since_is_valid, - input.value(input_index), - since_bool, - sliding_duration, - ) - }, - ) - .collect::>()?, - }; - - 
Ok(Arc::new(result)) - } -} - -#[cfg(test)] -mod tests { - - use super::*; - - #[test] - fn test_sliding_string_first_with_no_null() { - let entity_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let input: ArrayRef = Arc::new(StringArray::from(vec![ - Some("phone"), - Some("hello"), - Some("world"), - Some("monday"), - Some("dog"), - ])); - let sliding = BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - Some(true), - Some(false), - ]); - - let mut token = TwoStacksStringAccumToken::new(); - let output = TwoStacksFirstStringEvaluator::aggregate( - &mut token, - 1, - &entity_indices, - &input, - 2, - &sliding, - ) - .unwrap(); - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![ - Some("phone"), - Some("phone"), - Some("hello"), - Some("hello"), - Some("world") - ]) - ); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/string/two_stacks_last_string_evaluator.rs b/crates/sparrow-instructions/src/evaluators/aggregation/string/two_stacks_last_string_evaluator.rs deleted file mode 100644 index bfb0eb6e9..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/string/two_stacks_last_string_evaluator.rs +++ /dev/null @@ -1,268 +0,0 @@ -use std::sync::Arc; - -use crate::ValueRef; -use anyhow::anyhow; -use arrow::array::{Array, ArrayRef, BooleanArray, StringArray, UInt32Array}; -use arrow::datatypes::Int64Type; -use itertools::izip; -use sparrow_arrow::downcast::downcast_string_array; - -use crate::{ - AggregationArgs, Evaluator, LastString, RuntimeInfo, StateToken, TwoStacks, - TwoStacksStringAccumToken, -}; - -/// Evaluator for the `last` instruction on strings. 
-pub(crate) struct TwoStacksLastStringEvaluator { - pub args: AggregationArgs, - pub token: TwoStacksStringAccumToken, -} - -impl Evaluator for TwoStacksLastStringEvaluator { - fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { - match &self.args { - AggregationArgs::Sliding { - input, - ticks, - duration, - } => { - let grouping = info.grouping(); - let input_vals = info.value(input)?.array_ref()?; - let ticks = info.value(ticks)?.boolean_array()?; - let duration = info - .value(duration)? - .try_primitive_literal::()? - .ok_or_else(|| anyhow!("Expected non-null literal duration"))?; - if duration <= 0 { - anyhow::bail!( - "Expected positive duration for sliding window, saw {:?}", - duration - ); - } - let result = Self::aggregate( - &mut self.token, - grouping.num_groups(), - grouping.group_indices(), - &input_vals, - duration, - ticks.as_ref(), - ); - - result - } - AggregationArgs::Since { .. } | AggregationArgs::NoWindow { .. } => { - unreachable!( - "Expected sliding-windowed aggregation, saw non-windowed or since windowed." - ) - } - } - } - - fn state_token(&self) -> Option<&dyn StateToken> { - Some(&self.token) - } - - fn state_token_mut(&mut self) -> Option<&mut dyn StateToken> { - Some(&mut self.token) - } -} - -impl TwoStacksLastStringEvaluator { - fn ensure_entity_capacity( - token: &mut TwoStacksStringAccumToken, - entity_capacity: usize, - window_parts: i64, - ) { - token.resize(entity_capacity, window_parts); - } - - /// Updates the windowed accumulator based on the given flags. - /// - /// Accumulator behavior is to update -> emit -> reset, resulting in - /// exclusive start bounds and inclusive end bounds. 
- #[inline] - #[allow(clippy::too_many_arguments)] - fn update_two_stacks_accum( - token: &mut TwoStacksStringAccumToken, - entity_index: u32, - input_is_valid: bool, - sliding_is_valid: bool, - input: &str, - sliding: bool, - initial_windows: i64, - ) -> anyhow::Result> { - let evict_window = sliding_is_valid && sliding; - - let mut accum = match token.get_value(entity_index)? { - Some(accum) => accum, - None => TwoStacks::new(initial_windows), - }; - - if input_is_valid { - accum.add_input(&input.to_string()); - }; - - let value_to_emit = accum.accum_value(); - - if evict_window { - accum.evict(); - } - - token.put_value(entity_index, accum)?; - Ok(value_to_emit) - } - - /// Update the aggregation state with the given inputs and return the - /// aggregation. - /// - /// The `key_capacity` must be greater than all values in the - /// `entity_indices`. - /// - /// # Window Behavior - /// This aggregation uses the `sliding` window behavior. - /// - /// # Result - /// The result is an array containing the result of the aggregation for each - /// input row. - /// - /// # Assumptions - /// This assumes that the input data has been sorted by occurrence time. - /// Specifically, no checking is done to ensure that elements appear in the - /// appropriate order. 
- fn aggregate( - token: &mut TwoStacksStringAccumToken, - key_capacity: usize, - key_indices: &UInt32Array, - input: &ArrayRef, - sliding_duration: i64, - sliding_window: &BooleanArray, - ) -> anyhow::Result { - assert_eq!(key_indices.len(), input.len()); - let input: &StringArray = downcast_string_array(input.as_ref())?; - - Self::ensure_entity_capacity(token, key_capacity, sliding_duration); - - let result: StringArray = match (input.nulls(), sliding_window.nulls()) { - (None, None) => izip!(key_indices.values(), 0.., sliding_window.values().iter()) - .map(|(entity_index, input_index, since_bool)| { - Self::update_two_stacks_accum( - token, - *entity_index, - true, - true, - input.value(input_index), - since_bool, - sliding_duration, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), None) => izip!( - key_indices.values(), - input_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map(|(entity_index, input_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - token, - *entity_index, - input_is_valid, - true, - input.value(input_index), - since_bool, - sliding_duration, - ) - }) - .collect::>()?, - - (None, Some(window_valid_bits)) => izip!( - key_indices.values(), - window_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map(|(entity_index, since_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - token, - *entity_index, - true, - since_is_valid, - input.value(input_index), - since_bool, - sliding_duration, - ) - }) - .collect::>()?, - - (Some(input_valid_bits), Some(window_valid_bits)) => izip!( - key_indices.values(), - input_valid_bits, - window_valid_bits, - 0.., - sliding_window.values().iter() - ) - .map( - |(entity_index, input_is_valid, since_is_valid, input_index, since_bool)| { - Self::update_two_stacks_accum( - token, - *entity_index, - input_is_valid, - since_is_valid, - input.value(input_index), - since_bool, - sliding_duration, - ) - }, - ) - .collect::>()?, - }; - - 
Ok(Arc::new(result)) - } -} -#[cfg(test)] -mod tests { - - use super::*; - #[test] - fn test_sliding_string_last_with_no_null() { - let entity_indices = UInt32Array::from(vec![0, 0, 0, 0, 0]); - let input: ArrayRef = Arc::new(StringArray::from(vec![ - Some("phone"), - Some("hello"), - Some("world"), - Some("monday"), - Some("dog"), - ])); - let sliding = BooleanArray::from(vec![ - Some(true), - Some(false), - Some(false), - Some(true), - Some(false), - ]); - - let mut token = TwoStacksStringAccumToken::new(); - let output = TwoStacksLastStringEvaluator::aggregate( - &mut token, - 1, - &entity_indices, - &input, - 2, - &sliding, - ) - .unwrap(); - - assert_eq!( - downcast_string_array(output.as_ref()).unwrap(), - &StringArray::from(vec![ - Some("phone"), - Some("hello"), - Some("world"), - Some("monday"), - Some("dog") - ]) - ); - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/token.rs b/crates/sparrow-instructions/src/evaluators/aggregation/token.rs index 1489b5781..c1837436a 100644 --- a/crates/sparrow-instructions/src/evaluators/aggregation/token.rs +++ b/crates/sparrow-instructions/src/evaluators/aggregation/token.rs @@ -1,27 +1,17 @@ //! Tokens representing keys for compute storage. 
-mod boolean_accum_token; +mod array_ref_accum_token; mod collect_struct_token; mod collect_token; mod count_accum_token; -mod list_accum_token; -mod map_accum_token; mod primitive_accum_token; -mod string_accum_token; -mod two_stacks_boolean_accum_token; mod two_stacks_count_accum_token; mod two_stacks_primitive_accum_token; -mod two_stacks_string_accum_token; -pub use boolean_accum_token::*; +pub use array_ref_accum_token::*; pub use collect_struct_token::*; pub use collect_token::*; pub use count_accum_token::*; -pub use list_accum_token::*; -pub use map_accum_token::*; pub use primitive_accum_token::*; -pub use string_accum_token::*; -pub use two_stacks_boolean_accum_token::*; pub use two_stacks_count_accum_token::*; pub use two_stacks_primitive_accum_token::*; -pub use two_stacks_string_accum_token::*; diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/token/list_accum_token.rs b/crates/sparrow-instructions/src/evaluators/aggregation/token/array_ref_accum_token.rs similarity index 65% rename from crates/sparrow-instructions/src/evaluators/aggregation/token/list_accum_token.rs rename to crates/sparrow-instructions/src/evaluators/aggregation/token/array_ref_accum_token.rs index e955d73c9..1dde270d3 100644 --- a/crates/sparrow-instructions/src/evaluators/aggregation/token/list_accum_token.rs +++ b/crates/sparrow-instructions/src/evaluators/aggregation/token/array_ref_accum_token.rs @@ -1,22 +1,20 @@ -use arrow::array::{new_null_array, Array, ArrayRef, AsArray, MapArray}; +use arrow::array::{new_empty_array, new_null_array, Array, ArrayRef}; +use arrow_schema::DataType; use crate::{ComputeStore, StateToken, StoreKey}; -/// Token used for map accumulators -/// -/// Map accumulators are serialized as [ArrayRef], working directly with -/// Arrow. +/// Token used for accumulating an ArrayRef. #[derive(serde::Serialize, serde::Deserialize)] -pub struct ListAccumToken { +pub struct ArrayRefAccumToken { /// Stores the state for in-memory usage. 
#[serde(with = "sparrow_arrow::serde::array_ref")] pub accum: ArrayRef, } -impl StateToken for ListAccumToken { +impl StateToken for ArrayRefAccumToken { fn restore(&mut self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { if let Some(state) = store.get(key)? { - let state: ListAccumToken = state; + let state: ArrayRefAccumToken = state; self.accum = state.accum; }; Ok(()) @@ -27,13 +25,19 @@ impl StateToken for ListAccumToken { } } -impl ListAccumToken { +impl ArrayRefAccumToken { pub fn new(accum: ArrayRef) -> Self { Self { accum } } - pub fn array(&self) -> &MapArray { - self.accum.as_map() + pub fn empty(data_type: &DataType) -> Self { + Self { + accum: new_empty_array(data_type), + } + } + + pub fn array(&self) -> &dyn Array { + self.accum.as_ref() } /// Concat nulls to the end of the current accumulator to grow the size. @@ -41,14 +45,13 @@ impl ListAccumToken { let diff = len - self.accum.len(); let null_array = new_null_array(self.accum.data_type(), diff); - let null_array = null_array.as_ref().as_list::(); - let new_state = arrow::compute::concat(&[&self.accum, null_array])?; + let new_state = arrow::compute::concat(&[&self.accum, null_array.as_ref()])?; self.accum = new_state.clone(); Ok(()) } - pub fn value_is_null(&mut self, key: u32) -> bool { - self.accum.is_null(key as usize) + pub fn value_is_null(&mut self, key: usize) -> bool { + self.accum.is_null(key) } pub fn set_state(&mut self, new_state: ArrayRef) { diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/token/boolean_accum_token.rs b/crates/sparrow-instructions/src/evaluators/aggregation/token/boolean_accum_token.rs deleted file mode 100644 index 3ea3e6864..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/token/boolean_accum_token.rs +++ /dev/null @@ -1,108 +0,0 @@ -use bitvec::prelude::BitVec; -use serde::{Deserialize, Serialize}; - -use crate::{ComputeStore, StateToken, StoreKey}; - -/// Token used for boolean accumulators. 
-/// -/// Boolean accumulators are stored in two `BitVecs`, one validity bit -/// and one value bit. -#[derive(Default)] -pub struct BooleanAccumToken { - /// Serializable state of the accumulator. - accum: SerializableBooleanAccum, -} - -#[derive(Serialize, Deserialize, Default)] -struct SerializableBooleanAccum { - /// Stores the value. - accum: BitVec, - - /// Indicates whether the bit at the specified index in `accum` is valid. - is_valid: BitVec, -} - -impl SerializableBooleanAccum { - fn clear(&mut self) { - self.accum.clear(); - self.is_valid.clear(); - } - - fn resize(&mut self, len: usize) { - self.accum.resize(len, false); - self.is_valid.resize(len, false); - } - - fn is_set(&self, index: u32) -> bool { - self.is_valid[index as usize] - } - - fn set(&mut self, index: u32, value: bool) { - self.is_valid.set(index as usize, true); - self.accum.set(index as usize, value); - } - - fn unset(&mut self, index: u32) { - // No need to unset the `accum`, as the `is_valid` bit is marked as false. - self.is_valid.set(index as usize, false); - } - - pub(crate) fn get_optional_value(&mut self, entity_index: u32) -> anyhow::Result> { - if self.is_set(entity_index) { - Ok(Some(self.accum[entity_index as usize])) - } else { - Ok(None) - } - } -} - -impl StateToken for BooleanAccumToken { - fn restore(&mut self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - if let Some(accum) = store.get(key)? 
{ - self.accum = accum; - } else { - self.accum.clear(); - } - - Ok(()) - } - - fn store(&self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - store.put(key, &self.accum) - } -} - -impl BooleanAccumToken { - pub(crate) fn resize(&mut self, len: usize) { - self.accum.resize(len); - } - - fn set(&mut self, index: u32, value: bool) { - self.accum.set(index, value); - } - - fn unset(&mut self, index: u32) { - self.accum.unset(index); - } - - pub(crate) fn get_optional_value(&mut self, entity_index: u32) -> anyhow::Result> { - if self.accum.is_set(entity_index) { - self.accum.get_optional_value(entity_index) - } else { - Ok(None) - } - } - - pub(crate) fn put_optional_value( - &mut self, - entity_index: u32, - value: Option, - ) -> anyhow::Result<()> { - if let Some(v) = value { - self.set(entity_index, v); - } else { - self.unset(entity_index); - } - Ok(()) - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/token/map_accum_token.rs b/crates/sparrow-instructions/src/evaluators/aggregation/token/map_accum_token.rs deleted file mode 100644 index 96e9fcb23..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/token/map_accum_token.rs +++ /dev/null @@ -1,53 +0,0 @@ -use arrow::array::{new_null_array, Array, ArrayRef, AsArray}; - -use crate::{ComputeStore, StateToken, StoreKey}; - -/// Token used for map accumulators -/// -/// Map accumulators are serialized as [ArrayRef], working directly with -/// Arrow. -#[derive(serde::Serialize, serde::Deserialize)] -pub struct MapAccumToken { - /// Stores the state for in-memory usage. - #[serde(with = "sparrow_arrow::serde::array_ref")] - pub accum: ArrayRef, -} - -impl StateToken for MapAccumToken { - fn restore(&mut self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - if let Some(state) = store.get(key)? 
{ - let state: MapAccumToken = state; - self.accum = state.accum; - }; - Ok(()) - } - - fn store(&self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - store.put(key, &self) - } -} - -impl MapAccumToken { - pub fn new(accum: ArrayRef) -> Self { - Self { accum } - } - - /// Concat nulls to the end of the current accumulator to grow the size. - pub fn resize(&mut self, len: usize) -> anyhow::Result<()> { - let diff = len - self.accum.len(); - - let null_array = new_null_array(self.accum.data_type(), diff); - let null_array = null_array.as_ref().as_map(); - let new_state = arrow::compute::concat(&[&self.accum, null_array])?; - self.accum = new_state.clone(); - Ok(()) - } - - pub fn value_is_null(&mut self, key: u32) -> bool { - self.accum.is_null(key as usize) - } - - pub fn set_state(&mut self, new_state: ArrayRef) { - self.accum = new_state - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/token/string_accum_token.rs b/crates/sparrow-instructions/src/evaluators/aggregation/token/string_accum_token.rs deleted file mode 100644 index a28f769be..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/token/string_accum_token.rs +++ /dev/null @@ -1,35 +0,0 @@ -use crate::{ComputeStore, StateToken, StoreKey}; -/// Token used for string accumulators. -/// -/// String accumulators are stored as `[passId, instId, entity_index] -> -/// Option`. Values are updated entity-by-entity. -#[derive(Default)] -pub struct StringAccumToken { - /// Stores the state for in-memory usage. 
- accum: Vec>, -} - -impl StateToken for StringAccumToken { - fn restore(&mut self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - store.get_to_vec(key, &mut self.accum) - } - - fn store(&self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - store.put(key, &self.accum) - } -} - -impl StringAccumToken { - pub fn resize(&mut self, len: usize) { - self.accum.resize(len, None); - } - - pub fn get_value(&mut self, key: u32) -> anyhow::Result> { - Ok(self.accum[key as usize].clone()) - } - - pub fn put_value(&mut self, key: u32, value: Option) -> anyhow::Result<()> { - self.accum[key as usize] = value; - Ok(()) - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/token/two_stacks_boolean_accum_token.rs b/crates/sparrow-instructions/src/evaluators/aggregation/token/two_stacks_boolean_accum_token.rs deleted file mode 100644 index 6defd7bb9..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/token/two_stacks_boolean_accum_token.rs +++ /dev/null @@ -1,46 +0,0 @@ -use crate::aggregation::two_stacks::TwoStacks; -use crate::{AggFn, ComputeStore, StateToken, StoreKey}; - -/// Key used for windowed boolean accumulators using two-stacks -/// implementation. -/// -/// Stored as `[passId, instId, entity_index] -> TwoStacks` -pub struct TwoStacksBooleanAccumToken -where - AggF: AggFn, -{ - /// Stores the state for in-memory usage. 
- accum: Vec>, -} - -impl StateToken for TwoStacksBooleanAccumToken -where - AggF: AggFn, - Vec>: serde::ser::Serialize + serde::de::DeserializeOwned, -{ - fn restore(&mut self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - store.get_to_vec(key, &mut self.accum) - } - - fn store(&self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - store.put(key, &self.accum) - } -} - -impl TwoStacksBooleanAccumToken -where - AggF: AggFn, -{ - pub(crate) fn new() -> Self { - Self { accum: Vec::new() } - } - - pub(crate) fn get_boolean_accum(&mut self) -> anyhow::Result>> { - Ok(std::mem::take(&mut self.accum)) - } - - pub(crate) fn put_boolean_accum(&mut self, accum: Vec>) -> anyhow::Result<()> { - self.accum = accum; - Ok(()) - } -} diff --git a/crates/sparrow-instructions/src/evaluators/aggregation/token/two_stacks_string_accum_token.rs b/crates/sparrow-instructions/src/evaluators/aggregation/token/two_stacks_string_accum_token.rs deleted file mode 100644 index 223fa8126..000000000 --- a/crates/sparrow-instructions/src/evaluators/aggregation/token/two_stacks_string_accum_token.rs +++ /dev/null @@ -1,50 +0,0 @@ -use crate::aggregation::two_stacks::TwoStacks; -use crate::{AggFn, ComputeStore, StateToken, StoreKey}; - -/// Key used for windowed string accumulators using two-stacks -/// implementation. -/// -/// Stored as `[passId, instId, entity_index] -> TwoStacks` -pub struct TwoStacksStringAccumToken -where - AggF: AggFn, -{ - /// Stores the state. 
- accum: Vec>, -} - -impl StateToken for TwoStacksStringAccumToken -where - AggF: AggFn, - Vec>: serde::ser::Serialize + serde::de::DeserializeOwned, -{ - fn restore(&mut self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - store.get_to_vec(key, &mut self.accum) - } - - fn store(&self, key: &StoreKey, store: &ComputeStore) -> anyhow::Result<()> { - store.put(key, &self.accum) - } -} - -impl TwoStacksStringAccumToken -where - AggF: AggFn, -{ - pub(crate) fn new() -> Self { - Self { accum: Vec::new() } - } - - pub(crate) fn resize(&mut self, len: usize, initial_windows: i64) { - self.accum.resize(len, TwoStacks::new(initial_windows)); - } - - pub(crate) fn get_value(&mut self, key: u32) -> anyhow::Result>> { - Ok(Some(self.accum[key as usize].clone())) - } - - pub(crate) fn put_value(&mut self, key: u32, input: TwoStacks) -> anyhow::Result<()> { - self.accum[key as usize] = input; - Ok(()) - } -} diff --git a/crates/sparrow-main/tests/e2e/aggregation_tests.rs b/crates/sparrow-main/tests/e2e/aggregation_tests.rs index 73469f06c..d523cd2ea 100644 --- a/crates/sparrow-main/tests/e2e/aggregation_tests.rs +++ b/crates/sparrow-main/tests/e2e/aggregation_tests.rs @@ -446,6 +446,7 @@ async fn test_first_timestamp_ns() { "###); } +#[ignore = "https://github.com/kaskada-ai/kaskada/issues/754"] #[tokio::test] async fn test_last_sliding_i64() { insta::assert_snapshot!(QueryFixture::new("{ last: last(Numbers.m, window=sliding(3, monthly())) }").run_to_csv(&i64_data_fixture().await).await.unwrap(), @r###" diff --git a/crates/sparrow-main/tests/e2e/windowed_aggregation_tests.rs b/crates/sparrow-main/tests/e2e/windowed_aggregation_tests.rs index 67f621087..269d48684 100644 --- a/crates/sparrow-main/tests/e2e/windowed_aggregation_tests.rs +++ b/crates/sparrow-main/tests/e2e/windowed_aggregation_tests.rs @@ -286,6 +286,7 @@ async fn test_first_since_window_emits_value_on_reset() { /// Verifies that when a sliding window is evicted, the oldest value is emitted, /// 
regardless of input validity. +#[ignore = "https://github.com/kaskada-ai/kaskada/issues/754"] #[tokio::test] async fn test_first_sliding_window_emits_value_on_reset() { insta::assert_snapshot!(QueryFixture::new("{ n: Foo.n, first_sliding: first(Foo.n, window=sliding(2, is_valid(Foo))) }").run_to_csv(&window_data_fixture_with_nulls().await).await.unwrap(), @r###" @@ -320,6 +321,7 @@ async fn test_last_since_window_emits_value_on_reset() { /// Verifies that when a sliding window is evicted, the oldest value is emitted, /// regardless of input validity. +#[ignore = "https://github.com/kaskada-ai/kaskada/issues/754"] #[tokio::test] async fn test_last_sliding_window_emits_value_on_reset() { insta::assert_snapshot!(QueryFixture::new("{ n: Foo.n, last_sliding: last(Foo.n, window=sliding(2, is_valid(Foo))) }").run_to_csv(&window_data_fixture_with_nulls().await).await.unwrap(), @r###" @@ -395,6 +397,7 @@ async fn test_sum_sliding_every_few_events() { "###); } +#[ignore = "https://github.com/kaskada-ai/kaskada/issues/754"] #[tokio::test] async fn test_first_f64_sliding_every_few_events() { insta::assert_snapshot!(QueryFixture::new("{ sliding_first: first(Foo.n, window=sliding(2, is_valid(Foo))) }").run_to_csv(&window_data_fixture().await).await.unwrap(), @r###" @@ -410,6 +413,7 @@ async fn test_first_f64_sliding_every_few_events() { "###); } +#[ignore = "https://github.com/kaskada-ai/kaskada/issues/754"] #[tokio::test] async fn test_first_string_sliding_every_few_events() { insta::assert_snapshot!(QueryFixture::new("{ sliding_first: first(Foo.vegetable, window=sliding(2, is_valid(Foo))) }").run_to_csv(&window_data_fixture().await).await.unwrap(), @r###" @@ -425,6 +429,7 @@ async fn test_first_string_sliding_every_few_events() { "###); } +#[ignore = "https://github.com/kaskada-ai/kaskada/issues/754"] #[tokio::test] async fn test_first_boolean_sliding_every_few_events() { insta::assert_snapshot!(QueryFixture::new("{ sliding_first: first(Foo.bool, window=sliding(2, 
is_valid(Foo))) }").run_to_csv(&window_data_fixture().await).await.unwrap(), @r###" @@ -440,6 +445,7 @@ async fn test_first_boolean_sliding_every_few_events() { "###); } +#[ignore = "https://github.com/kaskada-ai/kaskada/issues/754"] #[tokio::test] async fn test_last_f64_sliding_every_few_events() { insta::assert_snapshot!(QueryFixture::new("{ sliding_last: last(Foo.n, window=sliding(2, is_valid(Foo))) }").run_to_csv(&window_data_fixture().await).await.unwrap(), @r###" @@ -454,6 +460,8 @@ async fn test_last_f64_sliding_every_few_events() { 1996-12-20T00:40:04.000000000,9223372036854775808,12960666915911099378,A,10.0 "###); } + +#[ignore = "https://github.com/kaskada-ai/kaskada/issues/754"] #[tokio::test] async fn test_last_string_sliding_every_few_events() { insta::assert_snapshot!(QueryFixture::new("{ sliding_last: last(Foo.vegetable, window=sliding(2, is_valid(Foo))) }").run_to_csv(&window_data_fixture().await).await.unwrap(), @r###" @@ -469,6 +477,7 @@ async fn test_last_string_sliding_every_few_events() { "###); } +#[ignore = "https://github.com/kaskada-ai/kaskada/issues/754"] #[tokio::test] async fn test_last_bool_sliding_every_few_events() { insta::assert_snapshot!(QueryFixture::new("{ sliding_last: last(Foo.bool, window=sliding(2, is_valid(Foo))) }").run_to_csv(&window_data_fixture().await).await.unwrap(), @r###"