diff --git a/.github/workflows/ci_python.yml b/.github/workflows/ci_python.yml index 0f43c6e22..3c8a89803 100644 --- a/.github/workflows/ci_python.yml +++ b/.github/workflows/ci_python.yml @@ -310,7 +310,11 @@ jobs: uses: baptiste0928/cargo-install@v2 with: crate: cargo-machete - - name: Machete + - name: Machete (Sparrow) + working-directory: + run: + cargo machete + - name: Machete (Python) run: cargo machete diff --git a/Cargo.lock b/Cargo.lock index 4c8b337ac..deca22691 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4271,6 +4271,7 @@ dependencies = [ "smallvec", "sparrow-arrow", "sparrow-expressions", + "sparrow-interfaces", "sparrow-logical", "sparrow-physical", "static_init", @@ -4382,6 +4383,21 @@ dependencies = [ "tonic 0.9.2", ] +[[package]] +name = "sparrow-expr-execution" +version = "0.11.0" +dependencies = [ + "approx 0.5.1", + "arrow-array", + "arrow-schema", + "error-stack", + "index_vec", + "sparrow-batch", + "sparrow-expressions", + "sparrow-interfaces", + "sparrow-physical", +] + [[package]] name = "sparrow-expressions" version = "0.11.0" @@ -4396,18 +4412,13 @@ dependencies = [ "arrow-schema", "arrow-select", "arrow-string", - "derive_more", "error-stack", - "hashbrown 0.14.0", - "index_vec", "inventory", "itertools 0.11.0", "num", "serde_json", "sparrow-arrow", - "sparrow-batch", - "sparrow-physical", - "static_init", + "sparrow-interfaces", "substring", ] @@ -4455,11 +4466,17 @@ dependencies = [ name = "sparrow-interfaces" version = "0.11.0" dependencies = [ + "arrow-array", "arrow-schema", "derive_more", "error-stack", "futures", + "hashbrown 0.14.0", + "inventory", + "itertools 0.11.0", + "sparrow-arrow", "sparrow-batch", + "static_init", ] [[package]] @@ -4607,6 +4624,7 @@ dependencies = [ "parking_lot 0.12.1", "sparrow-backend", "sparrow-batch", + "sparrow-expressions", "sparrow-interfaces", "sparrow-logical", "sparrow-physical", @@ -4733,7 +4751,6 @@ dependencies = [ "error-stack", "hashbrown 0.14.0", "index_vec", - "itertools 0.11.0", "loom", "parking_lot 0.12.1", "serde", @@ -4760,7 +4777,6 @@ dependencies = [ "sparrow-api", "sparrow-backend", "sparrow-compiler", - "sparrow-expressions", "sparrow-instructions", "sparrow-interfaces", "sparrow-logical", @@ -4847,6 +4863,7 @@ dependencies = [ "parking_lot 0.12.1", "sparrow-arrow", "sparrow-batch", + "sparrow-expr-execution", "sparrow-expressions", "sparrow-physical", "sparrow-scheduler", diff --git a/crates/sparrow-backend/Cargo.toml b/crates/sparrow-backend/Cargo.toml index 77d59af5c..c14c32d08 100644 --- a/crates/sparrow-backend/Cargo.toml +++ b/crates/sparrow-backend/Cargo.toml @@ -20,7 +20,7 @@ index_vec.workspace = true itertools.workspace = true smallvec.workspace = true sparrow-arrow = { path = "../sparrow-arrow" } -sparrow-expressions = { path = "../sparrow-expressions" } +sparrow-interfaces = { path = "../sparrow-interfaces" } sparrow-logical = { path = "../sparrow-logical" } sparrow-physical = { path = "../sparrow-physical" } uuid.workspace = true @@ -29,6 +29,7 @@ tracing.workspace = true [dev-dependencies] insta.workspace = true +sparrow-expressions = { path = "../sparrow-expressions" } [lib] doctest = false diff --git a/crates/sparrow-backend/src/exprs/expr_lang.rs b/crates/sparrow-backend/src/exprs/expr_lang.rs index b1cbfb10a..4cc6b0d4d 100644 --- a/crates/sparrow-backend/src/exprs/expr_lang.rs +++ b/crates/sparrow-backend/src/exprs/expr_lang.rs @@ -62,7 +62,7 @@ impl egg::FromOp for ExprLang { type Error = error_stack::Report; fn from_op(op: &str, children: Vec) -> Result { - let name = sparrow_expressions::intern_name(op) + let name = sparrow_interfaces::expression::intern_name(op) .ok_or_else(|| Error::NoSuchInstruction(op.to_owned()))?; let args = SmallVec::from_vec(children); diff --git a/crates/sparrow-backend/src/logical_to_physical.rs b/crates/sparrow-backend/src/logical_to_physical.rs index 46bc9393d..56c3d92d9 100644 --- a/crates/sparrow-backend/src/logical_to_physical.rs +++ b/crates/sparrow-backend/src/logical_to_physical.rs @@ -255,7 +255,7 @@ impl LogicalToPhysical { Ok(input) } other => { - let Some(instruction) = sparrow_expressions::intern_name(other) else { + let Some(instruction) = sparrow_interfaces::expression::intern_name(other) else { error_stack::bail!(Error::invalid_logical_plan(format!( "unknown instruction '{other}' in logical plan" ))); @@ -402,6 +402,8 @@ mod tests { #[test] fn test_logical_to_physical_arithmetic() { + sparrow_expressions::ensure_registered(); + let struct_type = DataType::Struct( vec![ Field::new("x", DataType::Int64, false), diff --git a/crates/sparrow-expr-execution/Cargo.toml b/crates/sparrow-expr-execution/Cargo.toml new file mode 100644 index 000000000..be2dcd0cd --- /dev/null +++ b/crates/sparrow-expr-execution/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "sparrow-expr-execution" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true +publish = false +description = """ +Execution of expressions over record batches. +""" + +[dependencies] +arrow-array.workspace = true +arrow-schema.workspace = true +error-stack.workspace = true +index_vec.workspace = true +sparrow-batch = { path = "../sparrow-batch" } +sparrow-interfaces = { path = "../sparrow-interfaces" } +sparrow-physical = { path = "../sparrow-physical" } + +[dev-dependencies] +approx.workspace = true +sparrow-expressions = { path = "../sparrow-expressions" } + +[lib] +doctest = false diff --git a/crates/sparrow-expr-execution/src/lib.rs b/crates/sparrow-expr-execution/src/lib.rs new file mode 100644 index 000000000..7536a5db3 --- /dev/null +++ b/crates/sparrow-expr-execution/src/lib.rs @@ -0,0 +1,103 @@ +#![warn( + rust_2018_idioms, + nonstandard_style, + future_incompatible, + clippy::mod_module_files, + clippy::print_stdout, + clippy::print_stderr, + clippy::undocumented_unsafe_blocks +)] + +//! Execution of expressions. +//! +//! Creating an [ExpressionExecutor] takes a sequence of +//! [expressions][sparrow_physical::Expr] and applies them to an input batch to +//! produce a sequence of computed columns. Expressions may reference columns +//! from the input to the expressions (typically the input to the step executing +//! the expressions), create a column from a literal value, or apply an +//! expression to columns computed by earlier expressions. + +use arrow_array::ArrayRef; +use arrow_schema::DataType; +use index_vec::IndexVec; +use sparrow_batch::Batch; + +use sparrow_interfaces::expression::{Error, Evaluator, StaticArg, StaticInfo, WorkArea}; + +/// Executes the expressions within an operation. +/// +/// Each operation produces a stream of inputs (`BoxedInputBatch`). +/// Expressions create columns from the input and by evaluating instructions +/// against existing columns. +pub struct ExpressionExecutor { + evaluators: Vec>, + output_type: DataType, +} + +impl ExpressionExecutor { + /// Create an `ExpressionExecutor` for the given expressions. + pub fn try_new(exprs: &[sparrow_physical::Expr]) -> error_stack::Result { + let evaluators = create_evaluators(exprs)?; + let output_type = exprs + .last() + .expect("at least one expression") + .result_type + .clone(); + Ok(Self { + evaluators, + output_type, + }) + } + + /// Execute the expressions on the given input batch. + /// + /// The result is a vector containing the results of each expression. + pub fn execute(&self, input: &Batch) -> error_stack::Result { + let mut work_area = WorkArea::with_capacity(input, self.evaluators.len()); + for evaluator in self.evaluators.iter() { + let output = evaluator.evaluate(&work_area)?; + work_area.expressions.push(output); + } + Ok(work_area.expressions.pop().unwrap()) + } + + pub fn output_type(&self) -> &DataType { + &self.output_type + } +} + +/// Create the evaluators for the given expressions. +fn create_evaluators( + exprs: &[sparrow_physical::Expr], +) -> error_stack::Result>, Error> { + // Static information (index in expressions, type, etc.) for each expression in `exprs`. + // This is used to locate the information about arguments to the remaining expressions. + // + // It is only needed while instantiating the evaluators. + let mut expressions = IndexVec::with_capacity(exprs.len()); + let mut evaluators = Vec::with_capacity(exprs.len()); + for (index, expr) in exprs.iter().enumerate() { + let args = expr.args.iter().map(|index| &expressions[*index]).collect(); + let info = StaticInfo { + name: &expr.name, + literal_args: &expr.literal_args, + args, + result_type: &expr.result_type, + }; + + evaluators.push(sparrow_interfaces::expression::create_evaluator(info)?); + expressions.push(StaticArg { + index, + data_type: &expr.result_type, + }); + } + Ok(evaluators) +} + +#[cfg(test)] +mod tests { + #[test] + fn test_expressions_registered() { + sparrow_expressions::ensure_registered(); + } +} diff --git a/crates/sparrow-expressions/Cargo.toml b/crates/sparrow-expressions/Cargo.toml index 93438188c..8a6bf470c 100644 --- a/crates/sparrow-expressions/Cargo.toml +++ b/crates/sparrow-expressions/Cargo.toml @@ -6,7 +6,7 @@ edition.workspace = true license.workspace = true publish = false description = """ -Execution of expressions over record batches. +Implementation of stateless expressions over batches. """ [dependencies] @@ -19,18 +19,13 @@ arrow-ord.workspace = true arrow-schema.workspace = true arrow-select.workspace = true arrow-string.workspace = true -derive_more.workspace = true error-stack.workspace = true -hashbrown.workspace = true -index_vec.workspace = true inventory.workspace = true itertools.workspace = true num.workspace = true serde_json.workspace = true sparrow-arrow = { path = "../sparrow-arrow" } -sparrow-batch = { path = "../sparrow-batch" } -sparrow-physical = { path = "../sparrow-physical" } -static_init.workspace = true +sparrow-interfaces = { path = "../sparrow-interfaces" } substring.workspace = true [dev-dependencies] diff --git a/crates/sparrow-expressions/src/evaluators/cast.rs b/crates/sparrow-expressions/src/cast.rs similarity index 94% rename from crates/sparrow-expressions/src/evaluators/cast.rs rename to crates/sparrow-expressions/src/cast.rs index d4dcdbb90..6ded5ab75 100644 --- a/crates/sparrow-expressions/src/evaluators/cast.rs +++ b/crates/sparrow-expressions/src/cast.rs @@ -7,12 +7,10 @@ use error_stack::{IntoReport, IntoReportCompat, ResultExt}; use sparrow_arrow::downcast::downcast_primitive_array; use sparrow_arrow::utils::make_null_array; -use crate::evaluator::Evaluator; -use crate::evaluators::time::i64_to_two_i32; -use crate::evaluators::{EvaluatorFactory, StaticInfo}; -use crate::values::ArrayRefValue; -use crate::work_area::WorkArea; -use crate::Error; +use crate::time::i64_to_two_i32; +use sparrow_interfaces::expression::{ + ArrayRefValue, Error, Evaluator, EvaluatorFactory, StaticInfo, WorkArea, +}; inventory::submit!(EvaluatorFactory { name: "cast", diff --git a/crates/sparrow-expressions/src/evaluators/coalesce.rs b/crates/sparrow-expressions/src/coalesce.rs similarity index 92% rename from crates/sparrow-expressions/src/evaluators/coalesce.rs rename to crates/sparrow-expressions/src/coalesce.rs index 3eb539ba5..5d202c102 100644 --- a/crates/sparrow-expressions/src/evaluators/coalesce.rs +++ b/crates/sparrow-expressions/src/coalesce.rs @@ -1,12 +1,9 @@ use arrow_array::{ArrayRef, BooleanArray}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::ArrayRefValue; -use crate::Error; +use sparrow_interfaces::expression::{ArrayRefValue, Error, Evaluator, StaticInfo}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "coalesce", create: &create }); @@ -19,7 +16,7 @@ struct CoalesceEvaluator { impl Evaluator for CoalesceEvaluator { fn evaluate( &self, - work_area: &crate::work_area::WorkArea<'_>, + work_area: &sparrow_interfaces::expression::WorkArea<'_>, ) -> error_stack::Result { let mut inputs = self.inputs.iter().copied(); diff --git a/crates/sparrow-expressions/src/evaluators/comparison.rs b/crates/sparrow-expressions/src/comparison.rs similarity index 100% rename from crates/sparrow-expressions/src/evaluators/comparison.rs rename to crates/sparrow-expressions/src/comparison.rs diff --git a/crates/sparrow-expressions/src/evaluators/comparison/gt_primitive.rs b/crates/sparrow-expressions/src/comparison/gt_primitive.rs similarity index 80% rename from crates/sparrow-expressions/src/evaluators/comparison/gt_primitive.rs rename to crates/sparrow-expressions/src/comparison/gt_primitive.rs index 0dfe331ec..c157d9f3a 100644 --- a/crates/sparrow-expressions/src/evaluators/comparison/gt_primitive.rs +++ b/crates/sparrow-expressions/src/comparison/gt_primitive.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "gt_primitive", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, ordered) + create: &crate::macros::create_primitive_evaluator!(0, create, ordered) }); /// Evaluator for the `gt_primitive` (greater than) instruction. diff --git a/crates/sparrow-expressions/src/evaluators/comparison/gte_primitive.rs b/crates/sparrow-expressions/src/comparison/gte_primitive.rs similarity index 81% rename from crates/sparrow-expressions/src/evaluators/comparison/gte_primitive.rs rename to crates/sparrow-expressions/src/comparison/gte_primitive.rs index 9d4dbfbdd..4b8167b7e 100644 --- a/crates/sparrow-expressions/src/evaluators/comparison/gte_primitive.rs +++ b/crates/sparrow-expressions/src/comparison/gte_primitive.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "gte_primitive", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, ordered) + create: &crate::macros::create_primitive_evaluator!(0, create, ordered) }); /// Evaluator for the `gte_primitive` (greater than or equal) instruction. diff --git a/crates/sparrow-expressions/src/evaluators/comparison/lt_primitive.rs b/crates/sparrow-expressions/src/comparison/lt_primitive.rs similarity index 80% rename from crates/sparrow-expressions/src/evaluators/comparison/lt_primitive.rs rename to crates/sparrow-expressions/src/comparison/lt_primitive.rs index c417a487f..5fa9784bb 100644 --- a/crates/sparrow-expressions/src/evaluators/comparison/lt_primitive.rs +++ b/crates/sparrow-expressions/src/comparison/lt_primitive.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "lt_primitive", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, ordered) + create: &crate::macros::create_primitive_evaluator!(0, create, ordered) }); /// Evaluator for the `lt_primitive` (less than) instruction. diff --git a/crates/sparrow-expressions/src/evaluators/comparison/lte_primitive.rs b/crates/sparrow-expressions/src/comparison/lte_primitive.rs similarity index 81% rename from crates/sparrow-expressions/src/evaluators/comparison/lte_primitive.rs rename to crates/sparrow-expressions/src/comparison/lte_primitive.rs index 1a7165745..441403bbe 100644 --- a/crates/sparrow-expressions/src/evaluators/comparison/lte_primitive.rs +++ b/crates/sparrow-expressions/src/comparison/lte_primitive.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "lte_primitive", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, ordered) + create: &crate::macros::create_primitive_evaluator!(0, create, ordered) }); /// Evaluator for the `lte_primitive` (less than or equal) instruction. diff --git a/crates/sparrow-expressions/src/evaluator.rs b/crates/sparrow-expressions/src/evaluator.rs deleted file mode 100644 index 1dec9df79..000000000 --- a/crates/sparrow-expressions/src/evaluator.rs +++ /dev/null @@ -1,10 +0,0 @@ -use arrow_array::ArrayRef; - -use crate::work_area::WorkArea; -use crate::Error; - -/// Trait for evaluating an individual expression node. -pub(super) trait Evaluator: Send + Sync { - /// Evaluate the function with the given runtime info. - fn evaluate(&self, work_area: &WorkArea<'_>) -> error_stack::Result; -} diff --git a/crates/sparrow-expressions/src/evaluators.rs b/crates/sparrow-expressions/src/evaluators.rs deleted file mode 100644 index e14e16c03..000000000 --- a/crates/sparrow-expressions/src/evaluators.rs +++ /dev/null @@ -1,233 +0,0 @@ -use std::borrow::Cow; - -use arrow_array::types::ArrowPrimitiveType; -use arrow_schema::DataType; -use hashbrown::HashMap; -use index_vec::IndexVec; -use itertools::Itertools; -use sparrow_arrow::scalar_value::ScalarValue; -use sparrow_physical::Expr; - -use crate::evaluator::Evaluator; -use crate::values::{ArrayRefValue, BooleanValue, PrimitiveValue, StringValue, StructValue}; -use crate::Error; - -mod cast; -mod coalesce; -mod comparison; -mod fieldref; -mod hash; -mod input; -mod is_valid; -mod json_field; -mod literal; -mod logical; -mod macros; -mod math; -mod record; -mod string; -mod time; - -/// Type alias for a function used to create an [Evaluator]. -/// -/// This type is equivalent to dynamic functions with signatures like: -/// -/// ``` -/// fn f<'a>(info: StaticInfo<'a>) -> error_stack::Result, Error> + Send + Sync -/// ``` -/// -/// This corresponds to the functions each evaluator registers for creating -/// an evaluator from the static information (types, constant arguments, and -/// information about the arguments). -type EvaluatorFactoryFn = - dyn for<'a> Fn(StaticInfo<'a>) -> error_stack::Result, Error> + Send + Sync; - -/// Factory for creating evaluators with a specific name. -struct EvaluatorFactory { - name: &'static str, - create: &'static EvaluatorFactoryFn, -} - -inventory::collect!(EvaluatorFactory); - -/// Static information available when creating an evaluator. -pub struct StaticInfo<'a> { - /// Name of the instruction to be evaluated. - name: &'a Cow<'static, str>, - /// Literal (static) arguments to *this* expression. - literal_args: &'a [ScalarValue], - /// Arguments (dynamic) to *this* expression. - args: Vec<&'a StaticArg<'a>>, - /// Result type this expression should produce. - /// - /// For many instructions, this should be inferred from the arguments. - /// It is part of the plan (a) for simplicity, so a plan may be executed - /// without performing type-checking and (b) because some instructions - /// need to know the result-type in order to execute (eg., cast). - result_type: &'a DataType, -} - -/// Information available when creating evaluators for a query. -pub struct StaticArg<'a> { - /// Expression index of argument. - index: usize, - /// The DataType of the argument. - data_type: &'a DataType, -} - -impl<'a> StaticArg<'a> { - pub fn primitive( - &self, - ) -> error_stack::Result, Error> { - PrimitiveValue::try_new(self.index, self.data_type) - } - - pub fn boolean(&self) -> error_stack::Result { - BooleanValue::try_new(self.index, self.data_type) - } - - pub fn string(&self) -> error_stack::Result { - StringValue::try_new(self.index, self.data_type) - } - - pub fn array_ref(&self) -> ArrayRefValue { - ArrayRefValue::new(self.index) - } - - pub fn struct_(&self) -> error_stack::Result { - StructValue::try_new(self.index, self.data_type) - } -} - -impl<'a> StaticInfo<'a> { - /// Return the scalar value corresponding to the exactly-one literal arguments. - fn literal(&self) -> error_stack::Result<&'a ScalarValue, Error> { - error_stack::ensure!( - self.literal_args.len() == 1, - Error::InvalidLiteralCount { - name: self.name.clone(), - expected: 1, - actual: self.literal_args.len() - } - ); - Ok(&self.literal_args[0]) - } - - /// Return the string value corresponding to the exactly-one literal arguments. - fn literal_string(&self) -> error_stack::Result<&'a str, Error> { - match self.literal()? { - ScalarValue::Utf8(Some(string)) => Ok(string), - ScalarValue::LargeUtf8(Some(string)) => Ok(string), - other => { - error_stack::bail!(Error::InvalidLiteral { - expected: "non-null string", - actual: other.clone() - }) - } - } - } - - fn unpack_argument(mut self) -> error_stack::Result<&'a StaticArg<'a>, Error> { - error_stack::ensure!( - self.args.len() == 1, - Error::InvalidArgumentCount { - name: self.name.clone(), - expected: 1, - actual: self.args.len() - } - ); - Ok(self.args.swap_remove(0)) - } - - fn unpack_arguments>>( - self, - ) -> error_stack::Result { - let actual = self.args.len(); - let mut args = self.args.into_iter(); - match args.next_tuple() { - Some(t) => Ok(t), - None => { - error_stack::bail!(Error::InvalidArgumentCount { - name: self.name.clone(), - expected: T::num_items(), - actual - }); - } - } - } -} - -/// Create the evaluators for the given expressions. -pub(super) fn create_evaluators( - exprs: &[Expr], -) -> error_stack::Result>, Error> { - // Static information (index in expressions, type, etc.) for each expression in `exprs`. - // This is used to locate the information about arguments to the remaining expressions. - // - // It is only needed while instantiating the evaluators. - let mut expressions = IndexVec::with_capacity(exprs.len()); - let mut evaluators = Vec::with_capacity(exprs.len()); - for (index, expr) in exprs.iter().enumerate() { - let args = expr.args.iter().map(|index| &expressions[*index]).collect(); - let info = StaticInfo { - name: &expr.name, - literal_args: &expr.literal_args, - args, - result_type: &expr.result_type, - }; - - evaluators.push(create_evaluator(info)?); - expressions.push(StaticArg { - index, - data_type: &expr.result_type, - }); - } - Ok(evaluators) -} - -// This needs to be marked lazy so it is run after the evaluators -// are submitted to the inventory. -#[static_init::dynamic(lazy)] -static EVALUATORS: HashMap<&'static str, &'static EvaluatorFactoryFn> = { - let result: HashMap<_, _> = inventory::iter::() - .map(|e| (e.name, e.create)) - .collect(); - - debug_assert_eq!( - result.len(), - inventory::iter::().count(), - "Expected every evaluator to appear in evaluator map. Duplicates: {:?}", - inventory::iter::() - .map(|e| e.name) - .duplicates() - .collect::>() - ); - result -}; - -fn create_evaluator(info: StaticInfo<'_>) -> error_stack::Result, Error> { - let Some(create) = EVALUATORS.get(info.name.as_ref()) else { - error_stack::bail!(Error::NoEvaluator(info.name.clone())) - }; - create(info) -} - -/// Use the names of registered evaluators to intern the given name. -/// -/// Returns `None` if no evaluator is registered for the given name. -pub fn intern_name(name: &str) -> Option<&'static str> { - EVALUATORS.get_key_value(name).map(|(k, _)| *k) -} - -// Exposed so we can report "nearest" names. -pub fn names() -> impl Iterator { - EVALUATORS.keys().copied() -} - -#[cfg(test)] -mod tests { - #[test] - fn test_evaluator_registration() { - assert!((*super::EVALUATORS).contains_key("add")) - } -} diff --git a/crates/sparrow-expressions/src/executor.rs b/crates/sparrow-expressions/src/executor.rs deleted file mode 100644 index 67843b651..000000000 --- a/crates/sparrow-expressions/src/executor.rs +++ /dev/null @@ -1,50 +0,0 @@ -use arrow_array::ArrayRef; -use arrow_schema::DataType; -use sparrow_batch::Batch; - -use crate::evaluator::Evaluator; -use crate::evaluators; -use crate::work_area::WorkArea; -use crate::Error; - -/// Executes the expressions within an operation. -/// -/// Each operation produces a stream of inputs (`BoxedInputBatch`). -/// Expressions create columns from the input and by evaluating instructions -/// against existing columns. -pub struct ExpressionExecutor { - evaluators: Vec>, - output_type: DataType, -} - -impl ExpressionExecutor { - /// Create an `ExpressionExecutor` for the given expressions. - pub fn try_new(exprs: &[sparrow_physical::Expr]) -> error_stack::Result { - let evaluators = evaluators::create_evaluators(exprs)?; - let output_type = exprs - .last() - .expect("at least one expression") - .result_type - .clone(); - Ok(Self { - evaluators, - output_type, - }) - } - - /// Execute the expressions on the given input batch. - /// - /// The result is a vector containing the results of each expression. - pub fn execute(&self, input: &Batch) -> error_stack::Result { - let mut work_area = WorkArea::with_capacity(input, self.evaluators.len()); - for evaluator in self.evaluators.iter() { - let output = evaluator.evaluate(&work_area)?; - work_area.expressions.push(output); - } - Ok(work_area.expressions.pop().unwrap()) - } - - pub fn output_type(&self) -> &DataType { - &self.output_type - } -} diff --git a/crates/sparrow-expressions/src/evaluators/fieldref.rs b/crates/sparrow-expressions/src/fieldref.rs similarity index 79% rename from crates/sparrow-expressions/src/evaluators/fieldref.rs rename to crates/sparrow-expressions/src/fieldref.rs index beefa711e..c09dc4cf3 100644 --- a/crates/sparrow-expressions/src/evaluators/fieldref.rs +++ b/crates/sparrow-expressions/src/fieldref.rs @@ -1,10 +1,8 @@ use arrow_schema::DataType; -use crate::evaluator::Evaluator; -use crate::values::StructValue; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, StaticInfo, StructValue, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "fieldref", create: &create }); @@ -18,15 +16,15 @@ struct FieldRefEvaluator { impl Evaluator for FieldRefEvaluator { fn evaluate( &self, - work_area: &crate::work_area::WorkArea<'_>, - ) -> error_stack::Result { + work_area: &WorkArea<'_>, + ) -> error_stack::Result { let input = work_area.expression(self.input); let field = input.column(self.field).clone(); Ok(field) } } -fn create(info: super::StaticInfo<'_>) -> error_stack::Result, crate::Error> { +fn create(info: StaticInfo<'_>) -> error_stack::Result, Error> { let field = info.literal_string()?; let result_type = info.result_type; diff --git a/crates/sparrow-expressions/src/evaluators/hash.rs b/crates/sparrow-expressions/src/hash.rs similarity index 77% rename from crates/sparrow-expressions/src/evaluators/hash.rs rename to crates/sparrow-expressions/src/hash.rs index d49177e70..b34537df3 100644 --- a/crates/sparrow-expressions/src/evaluators/hash.rs +++ b/crates/sparrow-expressions/src/hash.rs @@ -3,13 +3,9 @@ use std::sync::Arc; use arrow_array::ArrayRef; use error_stack::ResultExt; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::ArrayRefValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{ArrayRefValue, Error, Evaluator, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "hash", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/input.rs b/crates/sparrow-expressions/src/input.rs similarity index 57% rename from crates/sparrow-expressions/src/evaluators/input.rs rename to crates/sparrow-expressions/src/input.rs index f9a42606f..b0c349291 100644 --- a/crates/sparrow-expressions/src/evaluators/input.rs +++ b/crates/sparrow-expressions/src/input.rs @@ -1,10 +1,8 @@ use arrow_array::ArrayRef; -use crate::evaluator::Evaluator; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "input", create: &create, }); @@ -18,6 +16,8 @@ impl Evaluator for InputEvaluator { } } -fn create(_info: super::StaticInfo<'_>) -> error_stack::Result, Error> { +fn create( + _info: sparrow_interfaces::expression::StaticInfo<'_>, +) -> error_stack::Result, Error> { Ok(Box::new(InputEvaluator)) } diff --git a/crates/sparrow-expressions/src/evaluators/is_valid.rs b/crates/sparrow-expressions/src/is_valid.rs similarity index 79% rename from crates/sparrow-expressions/src/evaluators/is_valid.rs rename to crates/sparrow-expressions/src/is_valid.rs index e7dcf182c..68f996d80 100644 --- a/crates/sparrow-expressions/src/evaluators/is_valid.rs +++ b/crates/sparrow-expressions/src/is_valid.rs @@ -3,13 +3,9 @@ use std::sync::Arc; use arrow_array::ArrayRef; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::ArrayRefValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{ArrayRefValue, Error, Evaluator, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "is_valid", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/json_field.rs b/crates/sparrow-expressions/src/json_field.rs similarity index 91% rename from crates/sparrow-expressions/src/evaluators/json_field.rs rename to crates/sparrow-expressions/src/json_field.rs index e198900d2..86aa1fb67 100644 --- a/crates/sparrow-expressions/src/evaluators/json_field.rs +++ b/crates/sparrow-expressions/src/json_field.rs @@ -2,10 +2,9 @@ use std::sync::Arc; use arrow_array::StringArray; -use crate::evaluator::Evaluator; -use crate::values::StringValue; +use sparrow_interfaces::expression::{Error, Evaluator, StringValue, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "json_field", create: &create }); @@ -24,15 +23,17 @@ struct JsonFieldEvaluator { impl Evaluator for JsonFieldEvaluator { fn evaluate( &self, - work_area: &crate::work_area::WorkArea<'_>, - ) -> error_stack::Result { + work_area: &WorkArea<'_>, + ) -> error_stack::Result { let input = work_area.expression(self.input); let result = json_field(input, &self.field_name); Ok(Arc::new(result)) } } -fn create(info: super::StaticInfo<'_>) -> error_stack::Result, crate::Error> { +fn create( + info: sparrow_interfaces::expression::StaticInfo<'_>, +) -> error_stack::Result, Error> { let field_name = info.literal_string()?.to_owned(); let input = info.unpack_argument()?; diff --git a/crates/sparrow-expressions/src/lib.rs b/crates/sparrow-expressions/src/lib.rs index 76ac3bf85..101f30d9a 100644 --- a/crates/sparrow-expressions/src/lib.rs +++ b/crates/sparrow-expressions/src/lib.rs @@ -8,90 +8,31 @@ clippy::undocumented_unsafe_blocks )] -//! Execution of expressions. -//! -//! Creating an [ExpressionExecutor] takes a sequence of -//! [expressions][sparrow_physical::Expr] and applies them to an input batch to -//! produce a sequence of computed columns. Expressions may reference columns -//! from the input to the expressions (typically the input to the step executing -//! the expressions), create a column from a literal value, or apply an -//! expression to columns computed by earlier expressions. -//! -//! # Adding an expression -//! -//! Adding an evaluator for a new kind of expression requires the following steps -//! (more details follow): -//! -//! 1. In a file for the evaluator, create a struct implementing the evaluator. -//! This should have fields for the information necessary to execute the -//! expression. -//! 2. Implement [evaluator::Evaluator] for the evaluator struct. -//! 3. Write a method `create` which takes [evaluators::StaticInfo] and returns -//! an `error_stack::Result, Error>`. -//! 4. Register the creation method with inventory of supported expressions. -//! -//! ## Fields in the Evaluator -//! -//! For each input column the evaluator operates on, it should have a -//! [work_area::WorkAreaValue]. This represents a type-safe handle for accessing -//! the input column from the work area during runtime. These values are -//! obtained from the `StaticInfo` by calling methods such as -//! [evaluators::StaticArg::primitive]. -//! -//! Depending on the evaluator, it may also be necessary to have a primitive type -//! parameter, as can be seen with the various math operations. -//! -//! ## Implementing Create -//! -//! The create method should take the static info, verify that the types are -//! correct and the expression can be executed, and then create the evaluator -//! struct. Doing so also requires populating the values that will be referenced -//! at runtime. Usually, this population does the necessary type checking. -//! -//! ## Register the creation method -//! -//! Registering the evaluator is done using the following block of code. -//! This makes use of the `inventory` crate so evaluators may be registered -//! in other modules and discovered automatically. -//! -//! ``` -//! inventory::submit!(crate::evaluators::EvaluatorFactory { -//! name: "not", -//! create: &create -//! }); -//! ``` -//! -//! In cases where the `create` method has a generic parameter, the -//! `create_primitive_evaluator!` macro can be used to create an implementation -//! of `create` for the EvaluatorFactory that matches on the given data type. In -//! the following registration, the first argument (position `0`) is used to -//! determine the type of evaluator to create. The *actual* `create` method -//! is called with the necessary data type. -//! -//! The final argument indicates what types should be supported by the match. -//! It can be one or more pairs of the form `(ty_case, ty_name), ...`, but is -//! more typically one of the following defined types (corresponding to the -//! equivalent Fenl type classes): -//! -//! - `number`: Supports `i32`, `i64`, `u32`, `u64`, `f32` and `f64`. -//! - `signed`: Supports `i32`, `i64`, `f32` and `f64`. -//! - `float`: Supports `f32` and `f64`. -//! - `ordered`: Supports `number` and the timestamp types. -//! -//! ``` -//! inventory::submit!(crate::evaluators::EvaluatorFactory { -//! name: "add", -//! create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, number) -//! }); -//! ``` +//! Stateless expression evaluators. -mod error; -mod evaluator; -mod evaluators; -mod executor; -mod values; -mod work_area; +mod cast; +mod coalesce; +mod comparison; +mod fieldref; +mod hash; +mod input; +mod is_valid; +mod json_field; +mod literal; +mod logical; +mod macros; +mod math; +mod record; +mod string; +mod time; -pub use error::*; -pub use evaluators::{intern_name, names}; -pub use executor::*; +/// Call to ensure the expressions in this crate are installed. +/// +/// This does two things: +/// +/// 1. Avoids the compiler (or linter) removing the dependency on this crate +/// since it is (otherwise) only used via `inventory`. +/// 2. Check early to verify things are present. +pub fn ensure_registered() { + assert!(sparrow_interfaces::expression::intern_name("add").is_some()); +} diff --git a/crates/sparrow-expressions/src/evaluators/literal.rs b/crates/sparrow-expressions/src/literal.rs similarity index 72% rename from crates/sparrow-expressions/src/evaluators/literal.rs rename to crates/sparrow-expressions/src/literal.rs index 54f9657c3..7cf9526fe 100644 --- a/crates/sparrow-expressions/src/evaluators/literal.rs +++ b/crates/sparrow-expressions/src/literal.rs @@ -1,11 +1,9 @@ use arrow_array::ArrayRef; use sparrow_arrow::scalar_value::ScalarValue; -use crate::evaluator::Evaluator; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "literal", create: &create }); @@ -21,7 +19,9 @@ impl Evaluator for LiteralEvaluator { } } -fn create(info: super::StaticInfo<'_>) -> error_stack::Result, Error> { +fn create( + info: sparrow_interfaces::expression::StaticInfo<'_>, +) -> error_stack::Result, Error> { let scalar = info.literal()?.clone(); error_stack::ensure!( &scalar.data_type() == info.result_type, diff --git a/crates/sparrow-expressions/src/evaluators/logical.rs b/crates/sparrow-expressions/src/logical.rs similarity index 100% rename from crates/sparrow-expressions/src/evaluators/logical.rs rename to crates/sparrow-expressions/src/logical.rs diff --git a/crates/sparrow-expressions/src/evaluators/logical/if_.rs b/crates/sparrow-expressions/src/logical/if_.rs similarity index 94% rename from crates/sparrow-expressions/src/evaluators/logical/if_.rs rename to crates/sparrow-expressions/src/logical/if_.rs index 2f89f62be..ececdf511 100644 --- a/crates/sparrow-expressions/src/evaluators/logical/if_.rs +++ b/crates/sparrow-expressions/src/logical/if_.rs @@ -5,13 +5,11 @@ use arrow_data::ArrayData; use arrow_schema::DataType; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::{ArrayRefValue, BooleanValue}; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{ + ArrayRefValue, BooleanValue, Error, Evaluator, StaticInfo, WorkArea, +}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "if", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/logical/logical_and.rs b/crates/sparrow-expressions/src/logical/logical_and.rs similarity index 83% rename from crates/sparrow-expressions/src/evaluators/logical/logical_and.rs rename to crates/sparrow-expressions/src/logical/logical_and.rs index 72c80f3a6..f8e19f840 100644 --- a/crates/sparrow-expressions/src/evaluators/logical/logical_and.rs +++ b/crates/sparrow-expressions/src/logical/logical_and.rs @@ -3,13 +3,9 @@ use std::sync::Arc; use arrow_array::ArrayRef; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::BooleanValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{BooleanValue, Error, Evaluator, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "logical_and", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/logical/logical_or.rs b/crates/sparrow-expressions/src/logical/logical_or.rs similarity index 82% rename from crates/sparrow-expressions/src/evaluators/logical/logical_or.rs rename to crates/sparrow-expressions/src/logical/logical_or.rs index 0b0044bbb..7d975b8ee 100644 --- a/crates/sparrow-expressions/src/evaluators/logical/logical_or.rs +++ b/crates/sparrow-expressions/src/logical/logical_or.rs @@ -3,13 +3,9 @@ use std::sync::Arc; use arrow_array::ArrayRef; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::BooleanValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{BooleanValue, Error, Evaluator, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "logical_or", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/logical/not.rs b/crates/sparrow-expressions/src/logical/not.rs similarity index 78% rename from crates/sparrow-expressions/src/evaluators/logical/not.rs rename to crates/sparrow-expressions/src/logical/not.rs index b1a88cdbb..50ed1e26a 100644 --- a/crates/sparrow-expressions/src/evaluators/logical/not.rs +++ b/crates/sparrow-expressions/src/logical/not.rs @@ -3,13 +3,9 @@ use std::sync::Arc; use arrow_array::ArrayRef; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::BooleanValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{BooleanValue, Error, Evaluator, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "not", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/logical/null_if.rs b/crates/sparrow-expressions/src/logical/null_if.rs similarity index 94% rename from crates/sparrow-expressions/src/evaluators/logical/null_if.rs rename to crates/sparrow-expressions/src/logical/null_if.rs index 918376017..3e39b6247 100644 --- a/crates/sparrow-expressions/src/evaluators/logical/null_if.rs +++ b/crates/sparrow-expressions/src/logical/null_if.rs @@ -6,13 +6,11 @@ use arrow_data::ArrayData; use arrow_schema::DataType; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::{ArrayRefValue, BooleanValue}; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{ + ArrayRefValue, BooleanValue, Error, Evaluator, StaticInfo, WorkArea, +}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "null_if", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/macros.rs b/crates/sparrow-expressions/src/macros.rs similarity index 86% rename from crates/sparrow-expressions/src/evaluators/macros.rs rename to crates/sparrow-expressions/src/macros.rs index 108444637..db0a9ba14 100644 --- a/crates/sparrow-expressions/src/evaluators/macros.rs +++ b/crates/sparrow-expressions/src/macros.rs @@ -17,7 +17,7 @@ /// by `PrimitiveArray`. macro_rules! create_primitive_evaluator { ($type_index:expr, $evaluator:ident, number) => { - crate::evaluators::macros::create_primitive_evaluator! {$type_index, $evaluator, + crate::macros::create_primitive_evaluator! {$type_index, $evaluator, (arrow_schema::DataType::Int32, Int32Type), (arrow_schema::DataType::Int64, Int64Type), (arrow_schema::DataType::UInt32, UInt32Type), @@ -27,7 +27,7 @@ macro_rules! create_primitive_evaluator { } }; ($type_index:expr, $evaluator:ident, signed) => { - crate::evaluators::macros::create_primitive_evaluator! {$type_index, $evaluator, + crate::macros::create_primitive_evaluator! {$type_index, $evaluator, (arrow_schema::DataType::Int32, Int32Type), (arrow_schema::DataType::Int64, Int64Type), (arrow_schema::DataType::Float32, Float32Type), @@ -35,13 +35,13 @@ macro_rules! create_primitive_evaluator { } }; ($type_index:expr, $evaluator:ident, float) => { - crate::evaluators::macros::create_primitive_evaluator! {$type_index, $evaluator, + crate::macros::create_primitive_evaluator! {$type_index, $evaluator, (arrow_schema::DataType::Float32, Float32Type), (arrow_schema::DataType::Float64, Float64Type) } }; ($type_index:expr, $evaluator:ident, ordered) => { - crate::evaluators::macros::create_primitive_evaluator! {$type_index, $evaluator, + crate::macros::create_primitive_evaluator! {$type_index, $evaluator, (arrow_schema::DataType::Int32, Int32Type), (arrow_schema::DataType::Int64, Int64Type), (arrow_schema::DataType::UInt32, UInt32Type), @@ -55,7 +55,7 @@ macro_rules! create_primitive_evaluator { } }; ($type_index:expr, $evaluator:ident, $(($ty_case:pat, $ty_name:ident)),+) => { - |info: crate::evaluators::StaticInfo<'_>| -> error_stack::Result, crate::Error> { + |info: sparrow_interfaces::expression::StaticInfo<'_>| -> error_stack::Result, sparrow_interfaces::expression::Error> { use arrow_array::types::*; match info.args[$type_index].data_type { $($ty_case => $evaluator::<$ty_name>(info),)* diff --git a/crates/sparrow-expressions/src/evaluators/math.rs b/crates/sparrow-expressions/src/math.rs similarity index 100% rename from crates/sparrow-expressions/src/evaluators/math.rs rename to crates/sparrow-expressions/src/math.rs diff --git a/crates/sparrow-expressions/src/evaluators/math/add.rs b/crates/sparrow-expressions/src/math/add.rs similarity index 76% rename from crates/sparrow-expressions/src/evaluators/math/add.rs rename to crates/sparrow-expressions/src/math/add.rs index 44056726a..5971f84ba 100644 --- a/crates/sparrow-expressions/src/evaluators/math/add.rs +++ b/crates/sparrow-expressions/src/math/add.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "add", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, number) + create: &crate::macros::create_primitive_evaluator!(0, create, number) }); /// Evaluator for addition (`add`). diff --git a/crates/sparrow-expressions/src/evaluators/math/ceil.rs b/crates/sparrow-expressions/src/math/ceil.rs similarity index 86% rename from crates/sparrow-expressions/src/evaluators/math/ceil.rs rename to crates/sparrow-expressions/src/math/ceil.rs index b6af9f961..4ed03d1c6 100644 --- a/crates/sparrow-expressions/src/evaluators/math/ceil.rs +++ b/crates/sparrow-expressions/src/math/ceil.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType, PrimitiveArray}; use num::Float; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "ceil", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, float) + create: &crate::macros::create_primitive_evaluator!(0, create, float) }); /// Evaluator for the `ceil` expression. diff --git a/crates/sparrow-expressions/src/evaluators/math/clamp.rs b/crates/sparrow-expressions/src/math/clamp.rs similarity index 95% rename from crates/sparrow-expressions/src/evaluators/math/clamp.rs rename to crates/sparrow-expressions/src/math/clamp.rs index 7d3492c87..b04518134 100644 --- a/crates/sparrow-expressions/src/evaluators/math/clamp.rs +++ b/crates/sparrow-expressions/src/math/clamp.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType, PrimitiveArray}; use itertools::izip; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "clamp", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, ordered) + create: &crate::macros::create_primitive_evaluator!(0, create, ordered) }); /// Evaluator for Round. diff --git a/crates/sparrow-expressions/src/evaluators/math/div.rs b/crates/sparrow-expressions/src/math/div.rs similarity index 76% rename from crates/sparrow-expressions/src/evaluators/math/div.rs rename to crates/sparrow-expressions/src/math/div.rs index 5e15e7b5e..0adae7bd3 100644 --- a/crates/sparrow-expressions/src/evaluators/math/div.rs +++ b/crates/sparrow-expressions/src/math/div.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "div", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, number) + create: &crate::macros::create_primitive_evaluator!(0, create, number) }); /// Evaluator for division (`div`). diff --git a/crates/sparrow-expressions/src/evaluators/math/exp.rs b/crates/sparrow-expressions/src/math/exp.rs similarity index 85% rename from crates/sparrow-expressions/src/evaluators/math/exp.rs rename to crates/sparrow-expressions/src/math/exp.rs index d0d39f19b..dbdb619d1 100644 --- a/crates/sparrow-expressions/src/evaluators/math/exp.rs +++ b/crates/sparrow-expressions/src/math/exp.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType, PrimitiveArray}; use num::Float; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "exp", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, float) + create: &crate::macros::create_primitive_evaluator!(0, create, float) }); /// Evaluator for the `exp` expression. diff --git a/crates/sparrow-expressions/src/evaluators/math/floor.rs b/crates/sparrow-expressions/src/math/floor.rs similarity index 86% rename from crates/sparrow-expressions/src/evaluators/math/floor.rs rename to crates/sparrow-expressions/src/math/floor.rs index 6f9f173c6..8a658d4c3 100644 --- a/crates/sparrow-expressions/src/evaluators/math/floor.rs +++ b/crates/sparrow-expressions/src/math/floor.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType, PrimitiveArray}; use num::Float; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "floor", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, float) + create: &crate::macros::create_primitive_evaluator!(0, create, float) }); /// Evaluator for the `floor` expression. diff --git a/crates/sparrow-expressions/src/evaluators/math/greatest.rs b/crates/sparrow-expressions/src/math/greatest.rs similarity index 90% rename from crates/sparrow-expressions/src/evaluators/math/greatest.rs rename to crates/sparrow-expressions/src/math/greatest.rs index dea2adb71..51b744a6d 100644 --- a/crates/sparrow-expressions/src/evaluators/math/greatest.rs +++ b/crates/sparrow-expressions/src/math/greatest.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNativeTypeOp, ArrowNumericType, PrimitiveArray}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "greatest", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, number) + create: &crate::macros::create_primitive_evaluator!(0, create, number) }); /// Evaluator for the `greatest` (max) instruction. diff --git a/crates/sparrow-expressions/src/evaluators/math/least.rs b/crates/sparrow-expressions/src/math/least.rs similarity index 90% rename from crates/sparrow-expressions/src/evaluators/math/least.rs rename to crates/sparrow-expressions/src/math/least.rs index 7952b12b0..3ac15b972 100644 --- a/crates/sparrow-expressions/src/evaluators/math/least.rs +++ b/crates/sparrow-expressions/src/math/least.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNativeTypeOp, ArrowNumericType, PrimitiveArray}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "least", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, number) + create: &crate::macros::create_primitive_evaluator!(0, create, number) }); /// Evaluator for the `least` (min) instruction. diff --git a/crates/sparrow-expressions/src/evaluators/math/mul.rs b/crates/sparrow-expressions/src/math/mul.rs similarity index 76% rename from crates/sparrow-expressions/src/evaluators/math/mul.rs rename to crates/sparrow-expressions/src/math/mul.rs index 602d9551f..13cf07364 100644 --- a/crates/sparrow-expressions/src/evaluators/math/mul.rs +++ b/crates/sparrow-expressions/src/math/mul.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "mul", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, number) + create: &crate::macros::create_primitive_evaluator!(0, create, number) }); /// Evaluator for multiplication (`mul`). diff --git a/crates/sparrow-expressions/src/evaluators/math/neg.rs b/crates/sparrow-expressions/src/math/neg.rs similarity index 74% rename from crates/sparrow-expressions/src/evaluators/math/neg.rs rename to crates/sparrow-expressions/src/math/neg.rs index 229726ed3..12acd08e4 100644 --- a/crates/sparrow-expressions/src/evaluators/math/neg.rs +++ b/crates/sparrow-expressions/src/math/neg.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "neg", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, signed) + create: &crate::macros::create_primitive_evaluator!(0, create, signed) }); /// Evaluator for unary negation. diff --git a/crates/sparrow-expressions/src/evaluators/math/powf.rs b/crates/sparrow-expressions/src/math/powf.rs similarity index 90% rename from crates/sparrow-expressions/src/evaluators/math/powf.rs rename to crates/sparrow-expressions/src/math/powf.rs index 31434bc0a..40b3ca506 100644 --- a/crates/sparrow-expressions/src/evaluators/math/powf.rs +++ b/crates/sparrow-expressions/src/math/powf.rs @@ -4,15 +4,11 @@ use arrow_array::{ArrayRef, ArrowNumericType, PrimitiveArray}; use error_stack::{IntoReport, ResultExt}; use num::traits::Pow; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "powf", - create: &crate::evaluators::macros::create_primitive_evaluator!( + create: &crate::macros::create_primitive_evaluator!( 0, create, // Currently, the signature for powf forces f64, so we only instantiate that case. diff --git a/crates/sparrow-expressions/src/evaluators/math/round.rs b/crates/sparrow-expressions/src/math/round.rs similarity index 86% rename from crates/sparrow-expressions/src/evaluators/math/round.rs rename to crates/sparrow-expressions/src/math/round.rs index 303a34f29..24660ae8c 100644 --- a/crates/sparrow-expressions/src/evaluators/math/round.rs +++ b/crates/sparrow-expressions/src/math/round.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType, PrimitiveArray}; use num::Float; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "round", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, float) + create: &crate::macros::create_primitive_evaluator!(0, create, float) }); /// Evaluator for the `round` expression. diff --git a/crates/sparrow-expressions/src/evaluators/math/sub.rs b/crates/sparrow-expressions/src/math/sub.rs similarity index 76% rename from crates/sparrow-expressions/src/evaluators/math/sub.rs rename to crates/sparrow-expressions/src/math/sub.rs index 83f87e719..1fd0a8b6a 100644 --- a/crates/sparrow-expressions/src/evaluators/math/sub.rs +++ b/crates/sparrow-expressions/src/math/sub.rs @@ -3,15 +3,11 @@ use std::sync::Arc; use arrow_array::{ArrayRef, ArrowNumericType}; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::PrimitiveValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, PrimitiveValue, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "sub", - create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, number) + create: &crate::macros::create_primitive_evaluator!(0, create, number) }); /// Evaluator for subtraction (`sub`). diff --git a/crates/sparrow-expressions/src/evaluators/record.rs b/crates/sparrow-expressions/src/record.rs similarity index 89% rename from crates/sparrow-expressions/src/evaluators/record.rs rename to crates/sparrow-expressions/src/record.rs index fb78f6deb..1325895ca 100644 --- a/crates/sparrow-expressions/src/evaluators/record.rs +++ b/crates/sparrow-expressions/src/record.rs @@ -4,13 +4,9 @@ use arrow_array::StructArray; use arrow_schema::{DataType, Fields}; use itertools::Itertools; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::ArrayRefValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{ArrayRefValue, Error, Evaluator, StaticInfo, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "record", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/string.rs b/crates/sparrow-expressions/src/string.rs similarity index 100% rename from crates/sparrow-expressions/src/evaluators/string.rs rename to crates/sparrow-expressions/src/string.rs diff --git a/crates/sparrow-expressions/src/evaluators/string/len.rs b/crates/sparrow-expressions/src/string/len.rs similarity index 78% rename from crates/sparrow-expressions/src/evaluators/string/len.rs rename to crates/sparrow-expressions/src/string/len.rs index 81a849ab2..f0beda4bf 100644 --- a/crates/sparrow-expressions/src/evaluators/string/len.rs +++ b/crates/sparrow-expressions/src/string/len.rs @@ -3,13 +3,9 @@ use std::sync::Arc; use arrow_array::ArrayRef; use error_stack::{IntoReport, ResultExt}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::StringValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, StaticInfo, StringValue, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "len", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/string/lower.rs b/crates/sparrow-expressions/src/string/lower.rs similarity index 86% rename from crates/sparrow-expressions/src/evaluators/string/lower.rs rename to crates/sparrow-expressions/src/string/lower.rs index 943543b9c..b7bba27fa 100644 --- a/crates/sparrow-expressions/src/evaluators/string/lower.rs +++ b/crates/sparrow-expressions/src/string/lower.rs @@ -2,13 +2,9 @@ use std::sync::Arc; use arrow_array::{ArrayRef, StringArray}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::StringValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, StaticInfo, StringValue, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "lower", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/string/substring.rs b/crates/sparrow-expressions/src/string/substring.rs similarity index 95% rename from crates/sparrow-expressions/src/evaluators/string/substring.rs rename to crates/sparrow-expressions/src/string/substring.rs index 08aae70da..89e0a7b06 100644 --- a/crates/sparrow-expressions/src/evaluators/string/substring.rs +++ b/crates/sparrow-expressions/src/string/substring.rs @@ -5,13 +5,11 @@ use arrow_array::{ArrayRef, Int64Array, StringArray}; use itertools::izip; use substring::Substring; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::{PrimitiveValue, StringValue}; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{ + Error, Evaluator, PrimitiveValue, StaticInfo, StringValue, WorkArea, +}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "substring", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/string/upper.rs b/crates/sparrow-expressions/src/string/upper.rs similarity index 86% rename from crates/sparrow-expressions/src/evaluators/string/upper.rs rename to crates/sparrow-expressions/src/string/upper.rs index aa793864c..e500a16af 100644 --- a/crates/sparrow-expressions/src/evaluators/string/upper.rs +++ b/crates/sparrow-expressions/src/string/upper.rs @@ -2,13 +2,9 @@ use std::sync::Arc; use arrow_array::{ArrayRef, StringArray}; -use crate::evaluator::Evaluator; -use crate::evaluators::StaticInfo; -use crate::values::StringValue; -use crate::work_area::WorkArea; -use crate::Error; +use sparrow_interfaces::expression::{Error, Evaluator, StaticInfo, StringValue, WorkArea}; -inventory::submit!(crate::evaluators::EvaluatorFactory { +inventory::submit!(sparrow_interfaces::expression::EvaluatorFactory { name: "upper", create: &create }); diff --git a/crates/sparrow-expressions/src/evaluators/time.rs b/crates/sparrow-expressions/src/time.rs similarity index 100% rename from crates/sparrow-expressions/src/evaluators/time.rs rename to crates/sparrow-expressions/src/time.rs diff --git a/crates/sparrow-expressions/src/values.rs b/crates/sparrow-expressions/src/values.rs index 295ecd2e7..d33df4f9a 100644 --- a/crates/sparrow-expressions/src/values.rs +++ b/crates/sparrow-expressions/src/values.rs @@ -5,7 +5,7 @@ use arrow_array::types::ArrowPrimitiveType; use arrow_array::{ArrayRef, BooleanArray, PrimitiveArray, StringArray, StructArray}; use arrow_schema::DataType; -use crate::Error; +use sparrow_interfaces::expression::Error; pub trait WorkAreaValue: std::fmt::Debug + Clone + Copy { type Array<'a>; diff --git a/crates/sparrow-expressions/src/work_area.rs b/crates/sparrow-expressions/src/work_area.rs deleted file mode 100644 index 3c838bd0e..000000000 --- a/crates/sparrow-expressions/src/work_area.rs +++ /dev/null @@ -1,40 +0,0 @@ -use arrow_array::ArrayRef; -use sparrow_batch::Batch; - -use crate::values::WorkAreaValue; - -/// Information about an in-progress batch used for evaluation. -pub(super) struct WorkArea<'a> { - pub input: &'a Batch, - pub expressions: Vec, -} - -impl<'a> WorkArea<'a> { - /// Create a work area for processing the given batch. - /// - /// Arguments: - /// - `input`: The [Batch] to process - /// - `expressions`: The number of expressions processed. - pub fn with_capacity(input: &'a Batch, expressions: usize) -> Self { - assert!(!input.is_empty()); - Self { - input, - expressions: Vec::with_capacity(expressions), - } - } - - /// Return the [ArrayRef] for the given input index. - pub fn input_column(&self) -> &ArrayRef { - self.input.data().expect("non empty") - } - - /// Return the [Value] for the given expression index. - pub fn expression(&self, index: R) -> R::Array<'_> { - index.access(&self.expressions) - } - - /// Return the number of rows in the current work area. - pub fn num_rows(&self) -> usize { - self.input.num_rows() - } -} diff --git a/crates/sparrow-interfaces/Cargo.toml b/crates/sparrow-interfaces/Cargo.toml index dda71821d..0c8f99a04 100644 --- a/crates/sparrow-interfaces/Cargo.toml +++ b/crates/sparrow-interfaces/Cargo.toml @@ -10,11 +10,17 @@ Common interfaces for the Sparrow compilation and runtime. """ [dependencies] +arrow-array.workspace = true arrow-schema.workspace = true derive_more.workspace = true error-stack.workspace = true futures.workspace = true +hashbrown.workspace = true +inventory.workspace = true +itertools.workspace = true +sparrow-arrow = { path = "../sparrow-arrow" } sparrow-batch = { path = "../sparrow-batch" } +static_init.workspace = true [dev-dependencies] diff --git a/crates/sparrow-interfaces/src/expression.rs b/crates/sparrow-interfaces/src/expression.rs new file mode 100644 index 000000000..27eb0d8cc --- /dev/null +++ b/crates/sparrow-interfaces/src/expression.rs @@ -0,0 +1,151 @@ +//! Module for defining expression evaluators. +//! +//! Expression evaluators apply an instruction named in the physical plan to one +//! or more Arrow arrays. +//! +//! # Adding an expression +//! +//! Adding an evaluator for a new kind of expression requires the following steps +//! (more details follow): +//! +//! 1. In a file for the evaluator, create a struct implementing the evaluator. +//! This should have fields for the information necessary to execute the +//! expression. +//! 2. Implement [evaluator::Evaluator] for the evaluator struct. +//! 3. Write a method `create` which takes [evaluators::StaticInfo] and returns +//! an `error_stack::Result, Error>`. +//! 4. Register the creation method with inventory of supported expressions. +//! +//! ## Fields in the Evaluator +//! +//! For each input column the evaluator operates on, it should have a +//! [work_area::WorkAreaValue]. This represents a type-safe handle for accessing +//! the input column from the work area during runtime. These values are +//! obtained from the `StaticInfo` by calling methods such as +//! [evaluators::StaticArg::primitive]. +//! +//! Depending on the evaluator, it may also be necessary to have a primitive type +//! parameter, as can be seen with the various math operations. +//! +//! ## Implementing Create +//! +//! The create method should take the static info, verify that the types are +//! correct and the expression can be executed, and then create the evaluator +//! struct. Doing so also requires populating the values that will be referenced +//! at runtime. Usually, this population does the necessary type checking. +//! +//! ## Register the creation method +//! +//! Registering the evaluator is done using the following block of code. +//! This makes use of the `inventory` crate so evaluators may be registered +//! in other modules and discovered automatically. +//! +//! ``` +//! inventory::submit!(crate::evaluators::EvaluatorFactory { +//! name: "not", +//! create: &create +//! }); +//! ``` +//! +//! In cases where the `create` method has a generic parameter, the +//! `create_primitive_evaluator!` macro can be used to create an implementation +//! of `create` for the EvaluatorFactory that matches on the given data type. In +//! the following registration, the first argument (position `0`) is used to +//! determine the type of evaluator to create. The *actual* `create` method +//! is called with the necessary data type. +//! +//! The final argument indicates what types should be supported by the match. +//! It can be one or more pairs of the form `(ty_case, ty_name), ...`, but is +//! more typically one of the following defined types (corresponding to the +//! equivalent Fenl type classes): +//! +//! - `number`: Supports `i32`, `i64`, `u32`, `u64`, `f32` and `f64`. +//! - `signed`: Supports `i32`, `i64`, `f32` and `f64`. +//! - `float`: Supports `f32` and `f64`. +//! - `ordered`: Supports `number` and the timestamp types. +//! +//! ``` +//! inventory::submit!(crate::evaluators::EvaluatorFactory { +//! name: "add", +//! create: &crate::evaluators::macros::create_primitive_evaluator!(0, create, number) +//! }); +//! ``` + +mod error; +mod static_info; +mod work_area; + +use arrow_array::ArrayRef; +use itertools::Itertools; + +pub use error::*; +use hashbrown::HashMap; +pub use static_info::*; +pub use work_area::*; + +/// Trait for evaluating an individual expression node. +pub trait Evaluator: Send + Sync { + /// Evaluate the function with the given runtime info. + fn evaluate(&self, work_area: &WorkArea<'_>) -> error_stack::Result; +} + +/// Type alias for a function used to create an [Evaluator]. +/// +/// This type is equivalent to dynamic functions with signatures like: +/// +/// ``` +/// fn f<'a>(info: StaticInfo<'a>) -> error_stack::Result, Error> + Send + Sync +/// ``` +/// +/// This corresponds to the functions each evaluator registers for creating +/// an evaluator from the static information (types, constant arguments, and +/// information about the arguments). +type EvaluatorFactoryFn = + dyn for<'a> Fn(StaticInfo<'a>) -> error_stack::Result, Error> + Send + Sync; + +/// Factory for creating evaluators with a specific name. +pub struct EvaluatorFactory { + pub name: &'static str, + pub create: &'static EvaluatorFactoryFn, +} + +inventory::collect!(EvaluatorFactory); + +// This needs to be marked lazy so it is run after the evaluators +// are submitted to the inventory. +#[static_init::dynamic(lazy)] +static EVALUATORS: HashMap<&'static str, &'static EvaluatorFactoryFn> = { + let result: HashMap<_, _> = inventory::iter::() + .map(|e| (e.name, e.create)) + .collect(); + + debug_assert_eq!( + result.len(), + inventory::iter::().count(), + "Expected evaluators to have unique names. Duplicates: {:?}", + inventory::iter::() + .map(|e| e.name) + .duplicates() + .collect::>() + ); + result +}; + +pub fn create_evaluator(info: StaticInfo<'_>) -> error_stack::Result, Error> { + let Some(create) = EVALUATORS.get(info.name.as_ref()) else { + error_stack::bail!(Error::NoEvaluator(info.name.clone())) + }; + create(info) +} + +/// Use the names of registered evaluators to intern the given name. +/// +/// Returns `None` if no evaluator is registered for the given name. +pub fn intern_name(name: &str) -> Option<&'static str> { + EVALUATORS.get_key_value(name).map(|(k, _)| *k) +} + +// Exposed so we can report "nearest" names. +pub fn names() -> impl Iterator { + EVALUATORS.keys().copied() +} diff --git a/crates/sparrow-expressions/src/error.rs b/crates/sparrow-interfaces/src/expression/error.rs similarity index 100% rename from crates/sparrow-expressions/src/error.rs rename to crates/sparrow-interfaces/src/expression/error.rs diff --git a/crates/sparrow-interfaces/src/expression/evaluator.rs b/crates/sparrow-interfaces/src/expression/evaluator.rs new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/crates/sparrow-interfaces/src/expression/evaluator.rs @@ -0,0 +1 @@ + diff --git a/crates/sparrow-interfaces/src/expression/static_info.rs b/crates/sparrow-interfaces/src/expression/static_info.rs new file mode 100644 index 000000000..3d5d162b8 --- /dev/null +++ b/crates/sparrow-interfaces/src/expression/static_info.rs @@ -0,0 +1,118 @@ +use std::borrow::Cow; + +use arrow_array::types::ArrowPrimitiveType; +use arrow_schema::DataType; +use itertools::Itertools; +use sparrow_arrow::scalar_value::ScalarValue; + +use super::error::Error; +use crate::expression::work_area::{ + ArrayRefValue, BooleanValue, PrimitiveValue, StringValue, StructValue, +}; + +/// Static information available when creating an evaluator. +pub struct StaticInfo<'a> { + /// Name of the instruction to be evaluated. + pub name: &'a Cow<'static, str>, + /// Literal (static) arguments to *this* expression. + pub literal_args: &'a [ScalarValue], + /// Arguments (dynamic) to *this* expression. + pub args: Vec<&'a StaticArg<'a>>, + /// Result type this expression should produce. + /// + /// For many instructions, this should be inferred from the arguments. + /// It is part of the plan (a) for simplicity, so a plan may be executed + /// without performing type-checking and (b) because some instructions + /// need to know the result-type in order to execute (eg., cast). + pub result_type: &'a DataType, +} + +/// Information available when creating evaluators for a query. +pub struct StaticArg<'a> { + /// Expression index of argument. + pub index: usize, + /// The DataType of the argument. + pub data_type: &'a DataType, +} + +impl<'a> StaticArg<'a> { + pub fn primitive( + &self, + ) -> error_stack::Result, Error> { + PrimitiveValue::try_new(self.index, self.data_type) + } + + pub fn boolean(&self) -> error_stack::Result { + BooleanValue::try_new(self.index, self.data_type) + } + + pub fn string(&self) -> error_stack::Result { + StringValue::try_new(self.index, self.data_type) + } + + pub fn array_ref(&self) -> ArrayRefValue { + ArrayRefValue::new(self.index) + } + + pub fn struct_(&self) -> error_stack::Result { + StructValue::try_new(self.index, self.data_type) + } +} + +impl<'a> StaticInfo<'a> { + /// Return the scalar value corresponding to the exactly-one literal arguments. + pub fn literal(&self) -> error_stack::Result<&'a ScalarValue, Error> { + error_stack::ensure!( + self.literal_args.len() == 1, + Error::InvalidLiteralCount { + name: self.name.clone(), + expected: 1, + actual: self.literal_args.len() + } + ); + Ok(&self.literal_args[0]) + } + + /// Return the string value corresponding to the exactly-one literal arguments. + pub fn literal_string(&self) -> error_stack::Result<&'a str, Error> { + match self.literal()? { + ScalarValue::Utf8(Some(string)) => Ok(string), + ScalarValue::LargeUtf8(Some(string)) => Ok(string), + other => { + error_stack::bail!(Error::InvalidLiteral { + expected: "non-null string", + actual: other.clone() + }) + } + } + } + + pub fn unpack_argument(mut self) -> error_stack::Result<&'a StaticArg<'a>, Error> { + error_stack::ensure!( + self.args.len() == 1, + Error::InvalidArgumentCount { + name: self.name.clone(), + expected: 1, + actual: self.args.len() + } + ); + Ok(self.args.swap_remove(0)) + } + + pub fn unpack_arguments>>( + self, + ) -> error_stack::Result { + let actual = self.args.len(); + let mut args = self.args.into_iter(); + match args.next_tuple() { + Some(t) => Ok(t), + None => { + error_stack::bail!(Error::InvalidArgumentCount { + name: self.name.clone(), + expected: T::num_items(), + actual + }); + } + } + } +} diff --git a/crates/sparrow-interfaces/src/expression/work_area.rs b/crates/sparrow-interfaces/src/expression/work_area.rs new file mode 100644 index 000000000..83427cdf0 --- /dev/null +++ b/crates/sparrow-interfaces/src/expression/work_area.rs @@ -0,0 +1,196 @@ +use arrow_array::cast::{as_boolean_array, as_primitive_array, as_string_array, as_struct_array}; +use arrow_array::types::ArrowPrimitiveType; +use arrow_array::{ArrayRef, BooleanArray, PrimitiveArray, StringArray, StructArray}; +use arrow_schema::DataType; +use sparrow_batch::Batch; +use std::marker::PhantomData; + +use super::Error; + +/// Information about an in-progress batch used for evaluation. +pub struct WorkArea<'a> { + pub input: &'a Batch, + pub expressions: Vec, +} + +impl<'a> WorkArea<'a> { + /// Create a work area for processing the given batch. + /// + /// Arguments: + /// - `input`: The [Batch] to process + /// - `expressions`: The number of expressions processed. + pub fn with_capacity(input: &'a Batch, expressions: usize) -> Self { + assert!(!input.is_empty()); + Self { + input, + expressions: Vec::with_capacity(expressions), + } + } + + /// Return the [ArrayRef] for the given input index. + pub fn input_column(&self) -> &ArrayRef { + self.input.data().expect("non empty") + } + + /// Return the [Value] for the given expression index. + pub fn expression(&self, index: R) -> R::Array<'_> { + index.access(&self.expressions) + } + + /// Return the number of rows in the current work area. + pub fn num_rows(&self) -> usize { + self.input.num_rows() + } +} + +/// A reference to a value that may be retrieved from the [WorkArea]. +pub trait WorkAreaValue: std::fmt::Debug + Clone + Copy { + type Array<'a>; + fn access<'a>(&self, arrays: &'a [ArrayRef]) -> Self::Array<'a>; +} + +pub struct PrimitiveValue { + index: usize, + /// Use the type parameter and indicate it is invariant. + _phantom: PhantomData T>, +} + +impl PrimitiveValue { + pub fn try_new(index: usize, data_type: &DataType) -> error_stack::Result { + error_stack::ensure!( + &T::DATA_TYPE == data_type, + Error::InvalidArgumentType { + expected: T::DATA_TYPE, + actual: data_type.clone() + } + ); + + Ok(PrimitiveValue { + index, + _phantom: Default::default(), + }) + } +} + +impl std::fmt::Debug for PrimitiveValue { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PrimitiveRef") + .field("index", &self.index) + .finish_non_exhaustive() + } +} + +impl Clone for PrimitiveValue { + fn clone(&self) -> Self { + *self + } +} +impl Copy for PrimitiveValue {} + +impl WorkAreaValue for PrimitiveValue { + type Array<'a> = &'a PrimitiveArray; + + fn access<'a>(&self, arrays: &'a [ArrayRef]) -> Self::Array<'a> { + as_primitive_array(arrays[self.index].as_ref()) + } +} + +#[derive(Debug, Clone, Copy)] +#[repr(transparent)] +pub struct BooleanValue { + index: usize, +} + +impl BooleanValue { + pub fn try_new(index: usize, data_type: &DataType) -> error_stack::Result { + error_stack::ensure!( + data_type == &DataType::Boolean, + Error::InvalidArgumentType { + expected: DataType::Boolean, + actual: data_type.to_owned() + } + ); + Ok(Self { index }) + } +} + +impl WorkAreaValue for BooleanValue { + type Array<'a> = &'a BooleanArray; + + fn access<'a>(&self, arrays: &'a [ArrayRef]) -> Self::Array<'a> { + as_boolean_array(arrays[self.index].as_ref()) + } +} + +#[derive(Debug, Clone, Copy)] +#[repr(transparent)] +pub struct StringValue { + index: usize, +} + +impl StringValue { + pub fn try_new(index: usize, data_type: &DataType) -> error_stack::Result { + error_stack::ensure!( + data_type == &DataType::Utf8, + Error::InvalidArgumentType { + expected: DataType::Utf8, + actual: data_type.to_owned() + } + ); + Ok(Self { index }) + } +} + +impl WorkAreaValue for StringValue { + type Array<'a> = &'a StringArray; + + fn access<'a>(&self, arrays: &'a [ArrayRef]) -> Self::Array<'a> { + as_string_array(arrays[self.index].as_ref()) + } +} + +#[derive(Debug, Clone, Copy)] +#[repr(transparent)] +pub struct StructValue { + index: usize, +} + +impl StructValue { + pub fn try_new(index: usize, data_type: &DataType) -> error_stack::Result { + error_stack::ensure!( + matches!(data_type, DataType::Struct(_)), + Error::InvalidNonStructArgumentType { + actual: data_type.clone() + } + ); + Ok(Self { index }) + } +} + +impl WorkAreaValue for StructValue { + type Array<'a> = &'a StructArray; + + fn access<'a>(&self, arrays: &'a [ArrayRef]) -> Self::Array<'a> { + as_struct_array(arrays[self.index].as_ref()) + } +} + +#[derive(Debug, Clone, Copy)] +#[repr(transparent)] +pub struct ArrayRefValue { + index: usize, +} + +impl ArrayRefValue { + pub fn new(index: usize) -> Self { + Self { index } + } +} + +impl WorkAreaValue for ArrayRefValue { + type Array<'a> = &'a ArrayRef; + + fn access<'a>(&self, arrays: &'a [ArrayRef]) -> Self::Array<'a> { + &arrays[self.index] + } +} diff --git a/crates/sparrow-interfaces/src/lib.rs b/crates/sparrow-interfaces/src/lib.rs index 02e4eb741..7812d85fc 100644 --- a/crates/sparrow-interfaces/src/lib.rs +++ b/crates/sparrow-interfaces/src/lib.rs @@ -8,6 +8,7 @@ )] mod execution_options; +pub mod expression; pub mod source; pub use execution_options::*; diff --git a/crates/sparrow-plan-execution/Cargo.toml b/crates/sparrow-plan-execution/Cargo.toml index 72e609713..ce3632e7c 100644 --- a/crates/sparrow-plan-execution/Cargo.toml +++ b/crates/sparrow-plan-execution/Cargo.toml @@ -32,6 +32,7 @@ arrow-array.workspace = true arrow-schema.workspace = true index_vec.workspace = true sparrow-backend = { path = "../sparrow-backend" } +sparrow-expressions = { path = "../sparrow-expressions" } sparrow-logical = { path = "../sparrow-logical" } sparrow-session = { path = "../sparrow-session" } sparrow-sources = { path = "../sparrow-sources" } diff --git a/crates/sparrow-plan-execution/src/tests.rs b/crates/sparrow-plan-execution/src/tests.rs index 81763ddba..7adafc718 100644 --- a/crates/sparrow-plan-execution/src/tests.rs +++ b/crates/sparrow-plan-execution/src/tests.rs @@ -61,6 +61,8 @@ fn add_input_batch(session: &Session, source: &Arc) { #[test] fn test_logical_query_data_before_execute() { + sparrow_expressions::ensure_registered(); + sparrow_testing::init_test_logging(); let mut session = Session::default(); diff --git a/crates/sparrow-scheduler/Cargo.toml b/crates/sparrow-scheduler/Cargo.toml index 9aabf253a..9c376ff21 100644 --- a/crates/sparrow-scheduler/Cargo.toml +++ b/crates/sparrow-scheduler/Cargo.toml @@ -15,7 +15,6 @@ derive_more.workspace = true error-stack.workspace = true hashbrown.workspace = true index_vec.workspace = true -itertools.workspace = true parking_lot.workspace = true serde.workspace = true smallvec.workspace = true diff --git a/crates/sparrow-session/Cargo.toml b/crates/sparrow-session/Cargo.toml index d6f18778c..aa371324e 100644 --- a/crates/sparrow-session/Cargo.toml +++ b/crates/sparrow-session/Cargo.toml @@ -21,7 +21,6 @@ smallvec.workspace = true sparrow-api = { path = "../sparrow-api" } sparrow-backend = { path = "../sparrow-backend" } sparrow-compiler = { path = "../sparrow-compiler" } -sparrow-expressions = { path = "../sparrow-expressions" } sparrow-instructions = { path = "../sparrow-instructions" } sparrow-interfaces = { path = "../sparrow-interfaces" } sparrow-logical = { path = "../sparrow-logical" } diff --git a/crates/sparrow-session/src/partitioned/session.rs b/crates/sparrow-session/src/partitioned/session.rs index 139cd5a55..518322b1f 100644 --- a/crates/sparrow-session/src/partitioned/session.rs +++ b/crates/sparrow-session/src/partitioned/session.rs @@ -61,14 +61,18 @@ impl Session { "record" => "record", "fieldref" => "fieldref", other => { - // TODO: This *should* use a list of available _logical_ functions. - // For now, we approximate that with the set of available _physical_ functions. - sparrow_expressions::intern_name(other).ok_or_else(|| Error::NoSuchFunction { - name: other.to_owned(), - nearest: NearestMatches::new_nearest_strings( - other, - sparrow_expressions::names().map(|s| s.to_owned()), - ), + // TODO(https://github.com/kaskada-ai/kaskada/issues/818): This + // *should* use a list of available _logical_ functions. For + // now, we approximate that with the set of available _physical_ + // functions. + sparrow_interfaces::expression::intern_name(other).ok_or_else(|| { + Error::NoSuchFunction { + name: other.to_owned(), + nearest: NearestMatches::new_nearest_strings( + other, + sparrow_interfaces::expression::names().map(|s| s.to_owned()), + ), + } })? } }; diff --git a/crates/sparrow-transforms/Cargo.toml b/crates/sparrow-transforms/Cargo.toml index 3b4913553..d75f7da5b 100644 --- a/crates/sparrow-transforms/Cargo.toml +++ b/crates/sparrow-transforms/Cargo.toml @@ -19,13 +19,14 @@ itertools.workspace = true parking_lot.workspace = true sparrow-arrow = { path = "../sparrow-arrow" } sparrow-batch = { path = "../sparrow-batch" } -sparrow-expressions = { path = "../sparrow-expressions" } +sparrow-expr-execution = { path = "../sparrow-expr-execution" } sparrow-physical = { path = "../sparrow-physical" } sparrow-scheduler = { path = "../sparrow-scheduler" } tracing.workspace = true [dev-dependencies] index_vec.workspace = true +sparrow-expressions = { path = "../sparrow-expressions" } [lib] doctest = false diff --git a/crates/sparrow-transforms/src/lib.rs b/crates/sparrow-transforms/src/lib.rs index 5b061fac3..1bda636a6 100644 --- a/crates/sparrow-transforms/src/lib.rs +++ b/crates/sparrow-transforms/src/lib.rs @@ -21,3 +21,11 @@ mod transform; mod transform_pipeline; pub use transform_pipeline::*; + +#[cfg(test)] +mod tests { + #[test] + fn test_ensure_expressions_registered() { + sparrow_expressions::ensure_registered() + } +} diff --git a/crates/sparrow-transforms/src/project.rs b/crates/sparrow-transforms/src/project.rs index 96311e0da..815e41741 100644 --- a/crates/sparrow-transforms/src/project.rs +++ b/crates/sparrow-transforms/src/project.rs @@ -2,7 +2,7 @@ use arrow_schema::DataType; use error_stack::ResultExt; use sparrow_batch::Batch; -use sparrow_expressions::ExpressionExecutor; +use sparrow_expr_execution::ExpressionExecutor; use sparrow_physical::Exprs; use crate::transform::{Error, Transform}; diff --git a/crates/sparrow-transforms/src/select.rs b/crates/sparrow-transforms/src/select.rs index d0fd1a827..d12d5d70c 100644 --- a/crates/sparrow-transforms/src/select.rs +++ b/crates/sparrow-transforms/src/select.rs @@ -3,7 +3,7 @@ use arrow_schema::DataType; use error_stack::{IntoReport, ResultExt}; use sparrow_batch::Batch; -use sparrow_expressions::ExpressionExecutor; +use sparrow_expr_execution::ExpressionExecutor; use sparrow_physical::Exprs; use crate::transform::{Error, Transform}; diff --git a/python/Cargo.lock b/python/Cargo.lock index a3811196d..63fb5b27d 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3813,7 +3813,7 @@ dependencies = [ "itertools 0.11.0", "smallvec", "sparrow-arrow", - "sparrow-expressions", + "sparrow-interfaces", "sparrow-logical", "sparrow-physical", "static_init", @@ -3887,6 +3887,35 @@ dependencies = [ "tonic", ] +[[package]] +name = "sparrow-expr-execution" +version = "0.11.0" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", + "arrow-string", + "derive_more", + "error-stack", + "hashbrown 0.14.0", + "index_vec", + "inventory", + "itertools 0.11.0", + "num", + "serde_json", + "sparrow-arrow", + "sparrow-batch", + "sparrow-interfaces", + "sparrow-physical", + "static_init", + "substring", +] + [[package]] name = "sparrow-expressions" version = "0.11.0" @@ -3910,6 +3939,7 @@ dependencies = [ "serde_json", "sparrow-arrow", "sparrow-batch", + "sparrow-interfaces", "sparrow-physical", "static_init", "substring", @@ -3956,11 +3986,17 @@ dependencies = [ name = "sparrow-interfaces" version = "0.11.0" dependencies = [ + "arrow-array", "arrow-schema", "derive_more", "error-stack", "futures", + "hashbrown 0.14.0", + "inventory", + "itertools 0.11.0", + "sparrow-arrow", "sparrow-batch", + "static_init", ] [[package]] @@ -4221,7 +4257,7 @@ dependencies = [ "parking_lot 0.12.1", "sparrow-arrow", "sparrow-batch", - "sparrow-expressions", + "sparrow-expr-execution", "sparrow-physical", "sparrow-scheduler", "tracing",