From 441d26d8feeb90c1152d84113762bc5c94c239e0 Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Mon, 21 Aug 2023 12:56:26 -0700 Subject: [PATCH 01/18] pyudf class/trait impl framework: --- crates/sparrow-expressions/src/evaluator.rs | 2 +- crates/sparrow-expressions/src/work_area.rs | 2 +- crates/sparrow-session/src/lib.rs | 2 + crates/sparrow-session/src/session.rs | 2 +- crates/sparrow-syntax/src/syntax/signature.rs | 1 + python/Cargo.lock | 4 + python/Cargo.toml | 2 + python/pysrc/kaskada/_ffi.pyi | 12 +- python/pysrc/kaskada/_timestream.py | 80 +++++++++++++ python/pysrc/kaskada/udf.py | 72 ++++++----- python/pytests/udf_test.py | 49 +++++--- python/src/expr.rs | 62 ++++++++++ python/src/lib.rs | 3 +- python/src/udf.rs | 113 ++++++++++++++---- 14 files changed, 323 insertions(+), 83 deletions(-) diff --git a/crates/sparrow-expressions/src/evaluator.rs b/crates/sparrow-expressions/src/evaluator.rs index 1dec9df79..2e15bade3 100644 --- a/crates/sparrow-expressions/src/evaluator.rs +++ b/crates/sparrow-expressions/src/evaluator.rs @@ -4,7 +4,7 @@ use crate::work_area::WorkArea; use crate::Error; /// Trait for evaluating an individual expression node. -pub(super) trait Evaluator: Send + Sync { +pub trait Evaluator: Send + Sync { /// Evaluate the function with the given runtime info. fn evaluate(&self, work_area: &WorkArea<'_>) -> error_stack::Result; } diff --git a/crates/sparrow-expressions/src/work_area.rs b/crates/sparrow-expressions/src/work_area.rs index 444b614cb..8ab9f923e 100644 --- a/crates/sparrow-expressions/src/work_area.rs +++ b/crates/sparrow-expressions/src/work_area.rs @@ -4,7 +4,7 @@ use sparrow_arrow::Batch; use crate::values::WorkAreaValue; /// Information about an in-progress batch used for evaluation. -pub(super) struct WorkArea<'a> { +pub struct WorkArea<'a> { pub input: &'a Batch, pub expressions: Vec, } diff --git a/crates/sparrow-session/src/lib.rs b/crates/sparrow-session/src/lib.rs index bbc72adef..22caadd2d 100644 --- a/crates/sparrow-session/src/lib.rs +++ b/crates/sparrow-session/src/lib.rs @@ -9,3 +9,5 @@ pub use execution::Execution; pub use expr::{Expr, Literal}; pub use session::{ExecutionOptions, Session}; pub use table::Table; + +pub type Udf = sparrow_instructions::Udf; diff --git a/crates/sparrow-session/src/session.rs b/crates/sparrow-session/src/session.rs index 367db8418..e5b616342 100644 --- a/crates/sparrow-session/src/session.rs +++ b/crates/sparrow-session/src/session.rs @@ -292,7 +292,7 @@ impl Session { /// This bypasses much of the plumbing of the [ExprOp] required due to our construction /// of the AST. 
#[allow(unused)] - fn add_udf_to_dfg( + pub fn add_udf_to_dfg( &mut self, udf: Arc, args: Vec, diff --git a/crates/sparrow-syntax/src/syntax/signature.rs b/crates/sparrow-syntax/src/syntax/signature.rs index 7083b5678..c20942388 100644 --- a/crates/sparrow-syntax/src/syntax/signature.rs +++ b/crates/sparrow-syntax/src/syntax/signature.rs @@ -189,6 +189,7 @@ impl Signature { self.name, self.parameters.names().len(), num_args, + me ) } } diff --git a/python/Cargo.lock b/python/Cargo.lock index 689b222f6..5bdff94f2 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1971,9 +1971,13 @@ dependencies = [ "mimalloc", "pyo3", "pyo3-asyncio", + "sparrow-compiler", + "sparrow-instructions", "sparrow-session", + "sparrow-syntax", "tokio", "tracing", + "uuid 1.4.1", ] [[package]] diff --git a/python/Cargo.toml b/python/Cargo.toml index 02c175078..49c250868 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -23,8 +23,10 @@ mimalloc = { version = "0.1.37", default-features = false, features = ["local_dy pyo3 = {version = "0.19.1", features = ["abi3-py38", "extension-module", "generate-import-lib"]} pyo3-asyncio = { version = "0.19.0", features = ["tokio-runtime"] } sparrow-session = { path = "../crates/sparrow-session" } +sparrow-syntax = { path = "../crates/sparrow-syntax" } tokio = { version = "1.27.0", features = ["sync"] } tracing = "0.1.37" +uuid = { version = "1.3.0", features = ["v4"] } [lib] name = "kaskada" diff --git a/python/pysrc/kaskada/_ffi.pyi b/python/pysrc/kaskada/_ffi.pyi index f34f7f254..70ccf94ec 100644 --- a/python/pysrc/kaskada/_ffi.pyi +++ b/python/pysrc/kaskada/_ffi.pyi @@ -1,6 +1,7 @@ +from typing import Callable from typing import List from typing import Optional -from typing import Sequence +from typing import Sequencem import pyarrow as pa @@ -35,8 +36,6 @@ class Expr: def execute(self, options: Optional[ExecutionOptions] = None) -> Execution: ... def grouping(self) -> Optional[str]: ... -def call_udf(udf: Udf, result_type: pa.DataType, *args: pa.Array) -> pa.Array: ... - class Table(Expr): def __init__( self, @@ -52,3 +51,10 @@ class Table(Expr): @property def name(self) -> str: ... def add_pyarrow(self, data: pa.RecordBatch) -> None: ... + +class Udf(object): + def __init__( + self, + result_ty: str, + result_fn: Callable[..., pa.Array] + ) -> None: ... 
\ No newline at end of file diff --git a/python/pysrc/kaskada/_timestream.py b/python/pysrc/kaskada/_timestream.py index 44c7f915c..58b44ee57 100644 --- a/python/pysrc/kaskada/_timestream.py +++ b/python/pysrc/kaskada/_timestream.py @@ -4,6 +4,8 @@ import sys import warnings +import inspect + from datetime import datetime from datetime import timedelta from typing import Callable @@ -65,6 +67,74 @@ def _literal(value: Literal, session: _ffi.Session) -> Timestream: raise TypeError("Cannot create a literal Timestream from a datetime") return Timestream(_ffi.Expr.literal(session, value)) + @staticmethod + def _test_apply_udf( + signature: inspect.Signature, + *args: Union[Timestream, Literal], + session: Optional[_ffi.Session] = None, + ) -> Timestream: + """ + todo + """ + if session is None: + session = next( + arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream) + ) + + def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: + if isinstance(arg, Timestream): + return arg._ffi_expr + else: + return Timestream._literal(arg, session)._ffi_expr + + ffi_args = [make_arg(arg) for arg in args] + + arg_types = [param.annotation for param in signature.parameters.values()] + result_type = signature.return_annotation + + try: + return Timestream( + _ffi.Expr.udf(session=session, arg_types=arg_types, result_type=result_type, args=ffi_args) + ) + except TypeError as e: + # noqa: DAR401 + raise _augment_error(args, TypeError(str(e))) from e + except ValueError as e: + raise _augment_error(args, ValueError(str(e))) from e + + @staticmethod + def _test_decorator_udf( + signature: inspect.Signature, + *args: Union[Timestream, Literal], + session: Optional[_ffi.Session] = None, + ) -> Timestream: + """ + todo + """ + if session is None: + session = next( + arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream) + ) + + def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: + if isinstance(arg, Timestream): + return arg._ffi_expr + else: + return Timestream._literal(arg, session)._ffi_expr + + try: + return Timestream( + _ffi.Expr.decorator_udf(session=session, args=ffi_args) + ) + except TypeError as e: + # noqa: DAR401 + raise _augment_error(args, TypeError(str(e))) from e + except ValueError as e: + raise _augment_error(args, ValueError(str(e))) from e + + + + @staticmethod def _call( func: str, @@ -199,6 +269,16 @@ def pipe( else: return func(self, *args, **kwargs) +# TODO: How does udf work? We kind of want it like... apply some function on an input? +# But also want like `Timestream.udf(f, args*)` + def udf(callable: Callable, *args: Timestream) -> Timestream: + """ + todo + """ + signature = inspect.signature(callable) + + return Timestream._test_udf(signature, args) + def add(self, rhs: Union[Timestream, Literal]) -> Timestream: """ Create a Timestream adding this and `rhs`. diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index 05cf5e3b8..4cb71642d 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -1,57 +1,53 @@ """Functionality for calling Python UDFs from Kaskada.""" +from __future__ import annotations + import functools from typing import Callable import pandas as pd import pyarrow as pa +from ._timestream import Timestream +from ._timestream import Literal +import _ffi # TODO: Allow functions to return `pd.DataFrame` for struct arrays. 
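(Aside, not part of the patch: one plausible reading of the struct-array TODO above is mapping each `pd.DataFrame` column onto a field of a `pa.StructArray`. A minimal standalone sketch, with hypothetical column names:

import pandas as pd
import pyarrow as pa

# Hypothetical mapping: one DataFrame column per struct field.
df = pd.DataFrame({"sum": [11, 22], "diff": [9, 18]})
struct = pa.StructArray.from_arrays(
    [pa.array(df[c]) for c in df.columns], names=list(df.columns)
)
assert pa.types.is_struct(struct.type)
)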
 FuncType = Callable[..., pd.Series]
-
-class Udf(object):
-    """Class wrapping a UDF used in Kaskada."""
-
-    def __init__(self, name, func: FuncType, signature: str) -> None:
-        """Create a UDF for a function returning a Pandas series.
-
-        Parameters
-        ----------
-        name : str
-            Name of the function being wrapped.
-
-        func : FuncType
-            The callable to wrap.
-
-        signature : str
-            The Kaskada function signature for this UDF. Will be used to check
-            types of calls to this function and propagate type information for
-            the rest of the query.
-        """
+
+class Udf:
+    def __init__(self, signature: str, func: Callable[..., pa.Array]) -> None:
+        """Create a UDF with the given signature."""
         functools.update_wrapper(self, func)
-        self.name = name
-        self.func = func
-        self.signature = signature
-
-    def run_pyarrow(self, result_type: pa.DataType, *args: pa.Array) -> pa.Array:
-        """Run the function producing the given result type."""
-        # TODO: I believe this will return a series for simple arrays, and a
-        # dataframe for struct arrays. We should explore how this handles
-        # different types.
-        pd_args = [arg.to_pandas() for arg in args]
-        pd_result = self.func(*pd_args)
-
-        if isinstance(pd_result, pd.Series):
-            return pa.Array.from_pandas(pd_result, type=result_type)
-        else:
-            raise TypeError(f"Unsupported result type: {type(pd_result)}")
+        self._ffi = _ffi.Udf(signature, func)
+    def __call__(self, *args: Timestream | Literal) -> Timestream:
+        """Apply the UDF to the given arguments."""
+        Timestream._call(self._ffi, *args)
-def fenl_udf(name: str, signature: str):
+def udf(signature: str):
     """Decorate a function for use as a Kaskada UDF."""
 
     def decorator(func: FuncType):
-        return Udf(name, func, signature)
+        # 1. Convert the `FuncType` to the type expected by Udf.
+        #    This needs to take the PyArrow result type and PyArrow inputs,
+        #    convert them to Pandas, call the function, and convert the result
+        #    back to PyArrow.
+        #    Note: `functools.wraps` keeps the original function's metadata
+        #    (name, docstring) and avoids rebinding `func` to the wrapper
+        #    itself, which would recurse when the wrapper is invoked.
+        @functools.wraps(func)
+        def wrapped(result_type: pa.DataType, *args: pa.Array) -> pa.Array:
+            return _converted_func(func, result_type, *args)
+
+        # 2. Create the UDF object.
+        return Udf(signature, wrapped)
 
     return decorator
+
+def _converted_func(func: FuncType, result_type: pa.DataType, *args: pa.Array) -> pa.Array:
+    """Run the function producing the given result type."""
+    # TODO: I believe this will return a series for simple arrays, and a
+    # dataframe for struct arrays. We should explore how this handles
+    # different types.
+    pd_args = [arg.to_pandas() for arg in args]
+    pd_result = func(*pd_args)
+
+    if isinstance(pd_result, pd.Series):
+        return pa.Array.from_pandas(pd_result, type=result_type)
+    else:
+        raise TypeError(f"Unsupported result type: {type(pd_result)}")
\ No newline at end of file
diff --git a/python/pytests/udf_test.py b/python/pytests/udf_test.py
index c60b1f6ee..7ea34621d 100644
--- a/python/pytests/udf_test.py
+++ b/python/pytests/udf_test.py
@@ -1,26 +1,43 @@
 import pandas as pd
 import pyarrow as pa
-from kaskada._ffi import call_udf
-from kaskada.udf import Udf
-from kaskada.udf import fenl_udf
+import kaskada as kd
+import kaskdaa._ffi as _ffi
+import pytest
 
-
-@fenl_udf("add", "add(x: number, y: number) -> number")
+@kd.udf("add(x: number, y: number) -> number")
 def add(x: pd.Series, y: pd.Series) -> pd.Series:
+    """Use Pandas to add two numbers."""
    return x + y
 
 
-def test_numeric_udf_pure_python() -> None:
-    assert isinstance(add, Udf)
+def test_docstring() -> None:
+    assert add.__doc__ == "Use Pandas to add two numbers."
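(For context, not part of the patch: the pyarrow-to-pandas round trip that `_converted_func` performs can be sketched standalone, reusing the array values from the old `run_pyarrow` test being replaced above. Only pandas and pyarrow are assumed:

import pandas as pd
import pyarrow as pa

def add(x: pd.Series, y: pd.Series) -> pd.Series:
    return x + y

x = pa.array([1, 12, 17], type=pa.int8())
y = pa.array([1, 13, 18], type=pa.int8())
# to_pandas -> user function -> from_pandas is exactly the conversion
# _converted_func performs around the decorated function.
result = pa.Array.from_pandas(add(x.to_pandas(), y.to_pandas()), type=pa.int8())
assert result == pa.array([2, 25, 35], type=pa.int8())
)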
+ + +def test_udf_instance() -> None: + assert isinstance(add, _ffi.Udf) - x = pa.array([1, 12, 17, 23, 28], type=pa.int8()) - y = pa.array([1, 13, 18, 20, 4], type=pa.int8()) - result = add.run_pyarrow(pa.int8(), x, y) - assert result == pa.array([2, 25, 35, 43, 32], type=pa.int8()) +@pytest.fixture +def source_int64() -> kd.sources.CsvString: + content = "\n".join( + [ + "time,key,m,n", + "1996-12-19T16:39:57,A,5,10", + "1996-12-19T16:39:58,B,24,3", + "1996-12-19T16:39:59,A,17,6", + "1996-12-19T16:40:00,A,,9", + "1996-12-19T16:40:01,A,12,", + "1996-12-19T16:40:02,A,,", + ] + ) + return kd.sources.CsvString(content, time_column_name="time", key_column_name="key") +def test_add_udf_direct(golden, source_int64) -> None: + m = source_int64.col("m") + n = source_int64.col("n") + golden.jsonl(add(m, n)) -def test_numeric_udf_rust() -> None: - x = pa.array([1, 12, 17, 23, 28], type=pa.int8()) - y = pa.array([1, 13, 18, 20, 4], type=pa.int8()) - result = call_udf(add, pa.int8(), x, y) - assert result == pa.array([2, 25, 35, 43, 32], type=pa.int8()) +def test_add_udf_pipe(golden, source_int64) -> None: + m = source_int64.col("m") + n = source_int64.col("n") + golden.jsonl(m.pipe(add, n)) diff --git a/python/src/expr.rs b/python/src/expr.rs index 0acc47464..582f64b5d 100644 --- a/python/src/expr.rs +++ b/python/src/expr.rs @@ -44,6 +44,68 @@ impl Expr { Ok(Self { rust_expr, session }) } + // TODO: FRAZ + // Could you add a udf() method here that the _timestream calls? + // I want to be able to call a python function from the _timestream. + // I want to be able to do like polars: + // I like their `map` and `apply` method distinction -- `map` works on series, + // and `apply` works on individual values. + // + // + // TODO: Here is where you need to create your `Arc`. + // That means you need to + // 1. Infer arg types and result types from python signature + // 2. Infer FENL TYPES conversions (aka rust/data types kinda) + // 3. Create a SIGNATURE (Do arg names matter?) + // 4. Create Python Udf - signature + args? + // + // Signature inspection happens in _timestream. + // So I pass in arg_types and result_type, then construct signature here. + + #[staticmethod] + #[pyo3(signature = (session, arg_types, result_type, args))] + fn udf( + session: Session, + arg_types: Vec<&str>, + result_type: &str, + args: Vec, + ) -> PyResult { + if !args.iter().all(|e| e.session() == session) { + return Err(PyValueError::new_err( + "all arguments must be in the same session", + )); + } + + let mut rust_session = session.rust_session()?; + let args: Vec<_> = args.into_iter().map(|e| e.rust_expr).collect(); + println!("arg_types: {:?}", arg_types); + println!("result_type: {:?}", result_type); + + panic!("ok") + + // The python signature + } + + #[staticmethod] + #[pyo3(signature = (session, args))] + fn decorator_udf( + session: Session, + // TODO: signature? 
+ args: Vec, + ) -> PyResult { + if !args.iter().all(|e| e.session() == session) { + return Err(PyValueError::new_err( + "all arguments must be in the same session", + )); + } + + let mut rust_session = session.rust_session()?; + let args: Vec<_> = args.into_iter().map(|e| e.rust_expr).collect(); + println!("arg_types: {:?}", arg_types); + println!("result_type: {:?}", result_type); + panic!("ok") + } + #[staticmethod] #[pyo3(signature = (session, value))] fn literal(session: Session, value: Option) -> PyResult { diff --git a/python/src/lib.rs b/python/src/lib.rs index 957981d31..814656bf3 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,5 +1,6 @@ use pyo3::prelude::*; +mod call_udf; mod error; mod execution; mod expr; @@ -16,7 +17,7 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; #[pymodule] #[pyo3(name = "_ffi")] fn ffi(_py: Python<'_>, m: &PyModule) -> PyResult<()> { - m.add_function(wrap_pyfunction!(udf::call_udf, m)?)?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/python/src/udf.rs b/python/src/udf.rs index 5557b4c80..b2854bf97 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -4,29 +4,98 @@ use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::prelude::*; use pyo3::types::PyTuple; -#[pyfunction] -#[pyo3(signature = (udf, result_type, *args))] -pub(super) fn call_udf<'py>( - py: Python<'py>, - udf: &'py PyAny, - result_type: &'py PyAny, - args: &'py PyTuple, -) -> PyResult<&'py PyAny> { - let result_type = DataType::from_pyarrow(result_type)?; - - // 1. Make sure we can convert each input to and from arrow arrays. - let mut udf_args = Vec::with_capacity(args.len() + 1); - udf_args.push(result_type.to_pyarrow(py)?); - for arg in args { - let array_data = ArrayData::from_pyarrow(arg)?; - let py_array: PyObject = array_data.to_pyarrow(py)?; - udf_args.push(py_array); +struct PyUdf { + signature: Signature, + callable: Py, + uuid: uuid::Uuid, +} + +struct PyUdfEvaluator { + args: Vec, + callable: Py, +} + +#[derive(Clone)] +#[pyclass] +pub(crate) struct Udf(Arc); + +impl sparrow_session::Udf for PyUdf { + fn signature(&self) -> &Signature { + &self.signatrue } - let args = PyTuple::new(py, udf_args); - let result = udf.call_method("run_pyarrow", args, None)?; - let array_data: ArrayData = ArrayData::from_pyarrow(result)?; - assert_eq!(array_data.data_type(), &result_type); + // FRAZ: Should we allow this to return an error? + fn make_evaluator(&self, static_info: StaticInfo<'_>) -> Box { + // Idea: Create a random value of each argument type and verify that the callbale + // is invokable on that. This would let us detect errors early. - Ok(result) + // FRAZ: Get the value refs for each static arg. + Box::new(PyUdfEvaluator { + args, + callable: self.callable.clone(), + }) + } + + fn uuid(&self) -> &uuid::Uuid { + &self.uuid + } +} + +impl Evaluator for PyUdfEvaluator { + /// Evaluate the function with the given runtime info. + fn evaluate(&self, work_area: &WorkArea<'_>) -> error_stack::Result { + let result = Python::with_gil(|py| -> PyResult<()> { evaluate_py(py, args) }); + // FRAZ: handle python errors. + Ok(result) + } } + +impl PyUdfEvaluator { + fn evaluate_py<'py>(&self, py: Python<'py>, args: Vec) -> ArrayRef { + // FRAZ: Fill in based on `call_udf` + todo!() + } +} + +#[pymethods] +impl Udf { + #[new] + fn new(signature: String, callable: Py) -> Self { + // FRAZ: Parse the signature and create an instance of PyUdf. + // Then, create an `Arc` from that and pass that to + // Udf. 
+        let udf = PyUdf {
+            signature: todo!(),
+            callable,
+            uuid: todo!(),
+        };
+        Udf(Arc::new(udf))
+    }
+}
+
+// #[pyfunction]
+// #[pyo3(signature = (udf, result_type, *args))]
+// pub(super) fn call_udf<'py>(
+//     py: Python<'py>,
+//     udf: &'py PyAny,
+//     result_type: &'py PyAny,
+//     args: &'py PyTuple,
+// ) -> PyResult<&'py PyAny> {
+//     let result_type = DataType::from_pyarrow(result_type)?;
+
+//     // 1. Make sure we can convert each input to and from arrow arrays.
+//     let mut udf_args = Vec::with_capacity(args.len() + 1);
+//     udf_args.push(result_type.to_pyarrow(py)?);
+//     for arg in args {
+//         let array_data = ArrayData::from_pyarrow(arg)?;
+//         let py_array: PyObject = array_data.to_pyarrow(py)?;
+//         udf_args.push(py_array);
+//     }
+//     let args = PyTuple::new(py, udf_args);
+//     let result = udf.call_method("run_pyarrow", args, None)?;
+
+//     let array_data: ArrayData = ArrayData::from_pyarrow(result)?;
+//     assert_eq!(array_data.data_type(), &result_type);
+
+//     Ok(result)
+// }

From 01b0c188996104dccb2f78e7e4f865dec2525082 Mon Sep 17 00:00:00 2001
From: Jordan Frazier
Date: Tue, 22 Aug 2023 22:15:49 -0700
Subject: [PATCH 02/18] few more plumbing changes

---
 crates/sparrow-expressions/src/evaluator.rs   |   2 +-
 crates/sparrow-expressions/src/work_area.rs   |   2 +-
 crates/sparrow-instructions/src/evaluators.rs |   2 +-
 crates/sparrow-runtime/src/execute.rs         |   1 +
 .../sparrow-runtime/src/execute/operation.rs  |   1 +
 .../execute/operation/expression_executor.rs  |   2 +-
 crates/sparrow-session/src/lib.rs             |   2 +-
 crates/sparrow-syntax/src/syntax/signature.rs |   1 -
 python/Cargo.lock                             |   3 +-
 python/Cargo.toml                             |   3 +
 python/pysrc/kaskada/__init__.py              |   6 +-
 python/pysrc/kaskada/_ffi.pyi                 |   2 +-
 python/pysrc/kaskada/_timestream.py           | 140 +++++++++---------
 python/pysrc/kaskada/udf.py                   |   3 +-
 python/pytests/udf_test.py                    |   7 +-
 python/src/expr.rs                            |  62 --------
 python/src/lib.rs                             |   1 -
 python/src/udf.rs                             | 113 ++++++++------
 18 files changed, 164 insertions(+), 189 deletions(-)

diff --git a/crates/sparrow-expressions/src/evaluator.rs b/crates/sparrow-expressions/src/evaluator.rs
index 2e15bade3..1dec9df79 100644
--- a/crates/sparrow-expressions/src/evaluator.rs
+++ b/crates/sparrow-expressions/src/evaluator.rs
@@ -4,7 +4,7 @@ use crate::work_area::WorkArea;
 use crate::Error;
 
 /// Trait for evaluating an individual expression node.
-pub trait Evaluator: Send + Sync {
+pub(super) trait Evaluator: Send + Sync {
     /// Evaluate the function with the given runtime info.
     fn evaluate(&self, work_area: &WorkArea<'_>) -> error_stack::Result;
 }
diff --git a/crates/sparrow-expressions/src/work_area.rs b/crates/sparrow-expressions/src/work_area.rs
index 8ab9f923e..444b614cb 100644
--- a/crates/sparrow-expressions/src/work_area.rs
+++ b/crates/sparrow-expressions/src/work_area.rs
@@ -4,7 +4,7 @@ use sparrow_arrow::Batch;
 use crate::values::WorkAreaValue;
 
 /// Information about an in-progress batch used for evaluation.
-pub struct WorkArea<'a> { +pub(super) struct WorkArea<'a> { pub input: &'a Batch, pub expressions: Vec, } diff --git a/crates/sparrow-instructions/src/evaluators.rs b/crates/sparrow-instructions/src/evaluators.rs index 721f4c0e1..719748aa7 100644 --- a/crates/sparrow-instructions/src/evaluators.rs +++ b/crates/sparrow-instructions/src/evaluators.rs @@ -44,7 +44,7 @@ use time::*; #[derive(Debug)] pub struct StaticInfo<'a> { inst_kind: &'a InstKind, - args: Vec, + pub args: Vec, result_type: &'a DataType, } diff --git a/crates/sparrow-runtime/src/execute.rs b/crates/sparrow-runtime/src/execute.rs index 3b284df57..915057ef1 100644 --- a/crates/sparrow-runtime/src/execute.rs +++ b/crates/sparrow-runtime/src/execute.rs @@ -21,6 +21,7 @@ use crate::execute::output::Destination; use crate::key_hash_inverse::{KeyHashInverse, ThreadSafeKeyHashInverse}; use crate::stores::ObjectStoreRegistry; use crate::RuntimeOptions; +pub use operation::WorkArea; mod checkpoints; mod compute_executor; diff --git a/crates/sparrow-runtime/src/execute/operation.rs b/crates/sparrow-runtime/src/execute/operation.rs index 600cb8abb..f7d09c0c4 100644 --- a/crates/sparrow-runtime/src/execute/operation.rs +++ b/crates/sparrow-runtime/src/execute/operation.rs @@ -66,6 +66,7 @@ use crate::execute::Error; use crate::key_hash_inverse::ThreadSafeKeyHashInverse; use crate::stores::ObjectStoreRegistry; use crate::Batch; +pub use expression_executor::WorkArea; /// Information used while creating operations. /// diff --git a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs index 75d1f4c28..33b6c4944 100644 --- a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs +++ b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs @@ -263,7 +263,7 @@ impl ExpressionExecutor { } /// Wrapper around the columns produced during execution. -struct WorkArea { +pub struct WorkArea { time: ArrayRef, subsort: ArrayRef, key_hash: ArrayRef, diff --git a/crates/sparrow-session/src/lib.rs b/crates/sparrow-session/src/lib.rs index 22caadd2d..a06a88662 100644 --- a/crates/sparrow-session/src/lib.rs +++ b/crates/sparrow-session/src/lib.rs @@ -10,4 +10,4 @@ pub use expr::{Expr, Literal}; pub use session::{ExecutionOptions, Session}; pub use table::Table; -pub type Udf = sparrow_instructions::Udf; +// pub type Udf = dyn sparrow_instructions::Udf; diff --git a/crates/sparrow-syntax/src/syntax/signature.rs b/crates/sparrow-syntax/src/syntax/signature.rs index c20942388..7083b5678 100644 --- a/crates/sparrow-syntax/src/syntax/signature.rs +++ b/crates/sparrow-syntax/src/syntax/signature.rs @@ -189,7 +189,6 @@ impl Signature { self.name, self.parameters.names().len(), num_args, - me ) } } diff --git a/python/Cargo.lock b/python/Cargo.lock index 5bdff94f2..0dae8ea3c 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1963,6 +1963,7 @@ dependencies = [ name = "kaskada" version = "0.6.0-a.1" dependencies = [ + "anyhow", "arrow", "derive_more", "error-stack", @@ -1971,8 +1972,8 @@ dependencies = [ "mimalloc", "pyo3", "pyo3-asyncio", - "sparrow-compiler", "sparrow-instructions", + "sparrow-runtime", "sparrow-session", "sparrow-syntax", "tokio", diff --git a/python/Cargo.toml b/python/Cargo.toml index 49c250868..e681c28d0 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -13,6 +13,7 @@ Python library for building and executing temporal queries. 
[dependencies] arrow = { version = "43.0.0", features = ["pyarrow"] } +anyhow = { version = "1.0.70", features = ["backtrace"] } derive_more = "0.99.17" error-stack = { version = "0.3.1", features = ["anyhow", "spantrace"] } futures = "0.3.27" @@ -23,6 +24,8 @@ mimalloc = { version = "0.1.37", default-features = false, features = ["local_dy pyo3 = {version = "0.19.1", features = ["abi3-py38", "extension-module", "generate-import-lib"]} pyo3-asyncio = { version = "0.19.0", features = ["tokio-runtime"] } sparrow-session = { path = "../crates/sparrow-session" } +sparrow-instructions = { path = "../crates/sparrow-instructions" } +sparrow-runtime = { path = "../crates/sparrow-runtime" } sparrow-syntax = { path = "../crates/sparrow-syntax" } tokio = { version = "1.27.0", features = ["sync"] } tracing = "0.1.37" diff --git a/python/pysrc/kaskada/__init__.py b/python/pysrc/kaskada/__init__.py index 4534b480e..5c2d73fe5 100644 --- a/python/pysrc/kaskada/__init__.py +++ b/python/pysrc/kaskada/__init__.py @@ -1,4 +1,4 @@ -"""Kaskada query builder and local executon engine.""" +"""Kaskada query builder and local execution engine.""" from __future__ import annotations from . import plot @@ -10,7 +10,8 @@ from ._timestream import Literal from ._timestream import Timestream from ._timestream import record - +from .udf import Udf +from .udf import udf __all__ = [ "ExecutionOptions", @@ -21,5 +22,6 @@ "Result", "sources", "Timestream", + "udf", "windows", ] diff --git a/python/pysrc/kaskada/_ffi.pyi b/python/pysrc/kaskada/_ffi.pyi index 70ccf94ec..d77c3ae58 100644 --- a/python/pysrc/kaskada/_ffi.pyi +++ b/python/pysrc/kaskada/_ffi.pyi @@ -1,7 +1,7 @@ from typing import Callable from typing import List from typing import Optional -from typing import Sequencem +from typing import Sequence import pyarrow as pa diff --git a/python/pysrc/kaskada/_timestream.py b/python/pysrc/kaskada/_timestream.py index 58b44ee57..a074c0bd3 100644 --- a/python/pysrc/kaskada/_timestream.py +++ b/python/pysrc/kaskada/_timestream.py @@ -67,70 +67,70 @@ def _literal(value: Literal, session: _ffi.Session) -> Timestream: raise TypeError("Cannot create a literal Timestream from a datetime") return Timestream(_ffi.Expr.literal(session, value)) - @staticmethod - def _test_apply_udf( - signature: inspect.Signature, - *args: Union[Timestream, Literal], - session: Optional[_ffi.Session] = None, - ) -> Timestream: - """ - todo - """ - if session is None: - session = next( - arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream) - ) - - def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: - if isinstance(arg, Timestream): - return arg._ffi_expr - else: - return Timestream._literal(arg, session)._ffi_expr - - ffi_args = [make_arg(arg) for arg in args] - - arg_types = [param.annotation for param in signature.parameters.values()] - result_type = signature.return_annotation - - try: - return Timestream( - _ffi.Expr.udf(session=session, arg_types=arg_types, result_type=result_type, args=ffi_args) - ) - except TypeError as e: - # noqa: DAR401 - raise _augment_error(args, TypeError(str(e))) from e - except ValueError as e: - raise _augment_error(args, ValueError(str(e))) from e - - @staticmethod - def _test_decorator_udf( - signature: inspect.Signature, - *args: Union[Timestream, Literal], - session: Optional[_ffi.Session] = None, - ) -> Timestream: - """ - todo - """ - if session is None: - session = next( - arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream) - ) - - def make_arg(arg: Union[Timestream, 
Literal]) -> _ffi.Expr: - if isinstance(arg, Timestream): - return arg._ffi_expr - else: - return Timestream._literal(arg, session)._ffi_expr - - try: - return Timestream( - _ffi.Expr.decorator_udf(session=session, args=ffi_args) - ) - except TypeError as e: - # noqa: DAR401 - raise _augment_error(args, TypeError(str(e))) from e - except ValueError as e: - raise _augment_error(args, ValueError(str(e))) from e + # @staticmethod + # def _test_apply_udf( + # signature: inspect.Signature, + # *args: Union[Timestream, Literal], + # session: Optional[_ffi.Session] = None, + # ) -> Timestream: + # """ + # todo + # """ + # if session is None: + # session = next( + # arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream) + # ) + + # def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: + # if isinstance(arg, Timestream): + # return arg._ffi_expr + # else: + # return Timestream._literal(arg, session)._ffi_expr + + # ffi_args = [make_arg(arg) for arg in args] + + # arg_types = [param.annotation for param in signature.parameters.values()] + # result_type = signature.return_annotation + + # try: + # return Timestream( + # _ffi.Expr.udf(session=session, arg_types=arg_types, result_type=result_type, args=ffi_args) + # ) + # except TypeError as e: + # # noqa: DAR401 + # raise _augment_error(args, TypeError(str(e))) from e + # except ValueError as e: + # raise _augment_error(args, ValueError(str(e))) from e + + # @staticmethod + # def _test_decorator_udf( + # signature: inspect.Signature, + # *args: Union[Timestream, Literal], + # session: Optional[_ffi.Session] = None, + # ) -> Timestream: + # """ + # todo + # """ + # if session is None: + # session = next( + # arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream) + # ) + + # def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: + # if isinstance(arg, Timestream): + # return arg._ffi_expr + # else: + # return Timestream._literal(arg, session)._ffi_expr + + # try: + # return Timestream( + # _ffi.Expr.decorator_udf(session=session, args=ffi_args) + # ) + # except TypeError as e: + # # noqa: DAR401 + # raise _augment_error(args, TypeError(str(e))) from e + # except ValueError as e: + # raise _augment_error(args, ValueError(str(e))) from e @@ -271,13 +271,13 @@ def pipe( # TODO: How does udf work? We kind of want it like... apply some function on an input? # But also want like `Timestream.udf(f, args*)` - def udf(callable: Callable, *args: Timestream) -> Timestream: - """ - todo - """ - signature = inspect.signature(callable) + # def udf(callable: Callable, *args: Timestream) -> Timestream: + # """ + # todo + # """ + # signature = inspect.signature(callable) - return Timestream._test_udf(signature, args) + # return Timestream._test_udf(signature, args) def add(self, rhs: Union[Timestream, Literal]) -> Timestream: """ diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index 4cb71642d..df699bb62 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -9,7 +9,7 @@ from ._timestream import Timestream from ._timestream import Literal -import _ffi +from . import _ffi # TODO: Allow functions to return `pd.DataFrame` for struct arrays. FuncType = Callable[..., pd.Series] @@ -22,6 +22,7 @@ def __init__(self, signature: str, func: Callable[..., pa.Array]) -> None: def __call__(self, *args: Timestream | Literal) -> Timestream: """Apply the UDF to the given arguments.""" + # TODO: Timestream>_udf_call? 
Timestream._call(self._ffi, *args) def udf(signature: str): diff --git a/python/pytests/udf_test.py b/python/pytests/udf_test.py index 7ea34621d..f58fe906b 100644 --- a/python/pytests/udf_test.py +++ b/python/pytests/udf_test.py @@ -1,10 +1,10 @@ import pandas as pd import pyarrow as pa import kaskada as kd -import kaskdaa._ffi as _ffi +import kaskada._ffi as _ffi import pytest -@kd.udf("add(x: number, y: number) -> number") +@kd.udf("add(x: N, y: N) -> N") def add(x: pd.Series, y: pd.Series) -> pd.Series: """Use Pandas to add two numbers.""" return x + y @@ -15,7 +15,8 @@ def test_docstring() -> None: def test_udf_instance() -> None: - assert isinstance(add, _ffi.Udf) + assert isinstance(add, kd.Udf) + assert isinstance(add._ffi, _ffi.Udf) @pytest.fixture def source_int64() -> kd.sources.CsvString: diff --git a/python/src/expr.rs b/python/src/expr.rs index 582f64b5d..0acc47464 100644 --- a/python/src/expr.rs +++ b/python/src/expr.rs @@ -44,68 +44,6 @@ impl Expr { Ok(Self { rust_expr, session }) } - // TODO: FRAZ - // Could you add a udf() method here that the _timestream calls? - // I want to be able to call a python function from the _timestream. - // I want to be able to do like polars: - // I like their `map` and `apply` method distinction -- `map` works on series, - // and `apply` works on individual values. - // - // - // TODO: Here is where you need to create your `Arc`. - // That means you need to - // 1. Infer arg types and result types from python signature - // 2. Infer FENL TYPES conversions (aka rust/data types kinda) - // 3. Create a SIGNATURE (Do arg names matter?) - // 4. Create Python Udf - signature + args? - // - // Signature inspection happens in _timestream. - // So I pass in arg_types and result_type, then construct signature here. - - #[staticmethod] - #[pyo3(signature = (session, arg_types, result_type, args))] - fn udf( - session: Session, - arg_types: Vec<&str>, - result_type: &str, - args: Vec, - ) -> PyResult { - if !args.iter().all(|e| e.session() == session) { - return Err(PyValueError::new_err( - "all arguments must be in the same session", - )); - } - - let mut rust_session = session.rust_session()?; - let args: Vec<_> = args.into_iter().map(|e| e.rust_expr).collect(); - println!("arg_types: {:?}", arg_types); - println!("result_type: {:?}", result_type); - - panic!("ok") - - // The python signature - } - - #[staticmethod] - #[pyo3(signature = (session, args))] - fn decorator_udf( - session: Session, - // TODO: signature? 
- args: Vec, - ) -> PyResult { - if !args.iter().all(|e| e.session() == session) { - return Err(PyValueError::new_err( - "all arguments must be in the same session", - )); - } - - let mut rust_session = session.rust_session()?; - let args: Vec<_> = args.into_iter().map(|e| e.rust_expr).collect(); - println!("arg_types: {:?}", arg_types); - println!("result_type: {:?}", result_type); - panic!("ok") - } - #[staticmethod] #[pyo3(signature = (session, value))] fn literal(session: Session, value: Option) -> PyResult { diff --git a/python/src/lib.rs b/python/src/lib.rs index 814656bf3..e18cda982 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,6 +1,5 @@ use pyo3::prelude::*; -mod call_udf; mod error; mod execution; mod expr; diff --git a/python/src/udf.rs b/python/src/udf.rs index b2854bf97..db9c525de 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -1,9 +1,17 @@ -use arrow::array::ArrayData; +use std::fmt::Debug; +use std::sync::Arc; +use itertools::Itertools; + +use arrow::array::{ArrayData, ArrayRef}; use arrow::datatypes::DataType; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::prelude::*; use pyo3::types::PyTuple; +use sparrow_instructions::{Evaluator, RuntimeInfo, StaticInfo, ValueRef}; +use sparrow_syntax::{FeatureSetPart, Signature}; +use uuid::Uuid; +#[derive(Debug)] struct PyUdf { signature: Signature, callable: Py, @@ -17,18 +25,21 @@ struct PyUdfEvaluator { #[derive(Clone)] #[pyclass] -pub(crate) struct Udf(Arc); +pub(crate) struct Udf(Arc); -impl sparrow_session::Udf for PyUdf { +impl sparrow_instructions::Udf for PyUdf { fn signature(&self) -> &Signature { - &self.signatrue + &self.signature } // FRAZ: Should we allow this to return an error? - fn make_evaluator(&self, static_info: StaticInfo<'_>) -> Box { + fn make_evaluator(&self, info: StaticInfo<'_>) -> Box { + println!("Making udf evaluator"); // Idea: Create a random value of each argument type and verify that the callbale // is invokable on that. This would let us detect errors early. + let args = info.args.iter().map(|arg| arg.value_ref.clone()).collect(); + // FRAZ: Get the value refs for each static arg. Box::new(PyUdfEvaluator { args, @@ -42,9 +53,17 @@ impl sparrow_session::Udf for PyUdf { } impl Evaluator for PyUdfEvaluator { - /// Evaluate the function with the given runtime info. - fn evaluate(&self, work_area: &WorkArea<'_>) -> error_stack::Result { - let result = Python::with_gil(|py| -> PyResult<()> { evaluate_py(py, args) }); + fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { + let args = self.args + .iter() + .map(|value_ref| -> anyhow::Result<_> { + Ok(info.value(value_ref)?.array_ref()?) + }).try_collect()?; + + let result = Python::with_gil(|py| -> anyhow::Result { + // todo: uhh + Ok(self.evaluate_py(py, args)) + })?; // FRAZ: handle python errors. 
Ok(result) } @@ -52,8 +71,45 @@ impl Evaluator for PyUdfEvaluator { impl PyUdfEvaluator { fn evaluate_py<'py>(&self, py: Python<'py>, args: Vec) -> ArrayRef { - // FRAZ: Fill in based on `call_udf` - todo!() + println!("Evaluating python"); + + // self.callable.call1(py, (args,)).unwrap().extract(py).unwrap(); + // let args: Vec = args + // .iter() + // .map::, _>(|value_ref| { + // let value: ColumnarValue = info.value(value_ref)?; + // let array_ref: ArrayRef = value.array_ref()?; + // let array_data: ArrayData = aray_ref.to_data(); + // let py_obj: PyObject = array_data.to_pyarrow(py)?; + + // Ok(py_obj) + // }) + // .collect::>()?; + + // let locals = [("args", args)].into_py_dict(py); + // py.run(&self.code, None, Some(&locals))?; + // let result_py: PyObject = locals.get_item("ret").unwrap().into(); + + // let result_array_data: ArrayData = ArrayData::from_pyarrow(result_py.as_ref(py))?; + + // // We can't control the returned type, but we can refuse to accept the "wrong" type. + // ensure!( + // *result_array_data.data_type() == self.result_type, + // "expected Python UDF to return type {:?}, but received {:?}", + // result_array_data.data_type(), + // self.result_type + // ); + + // let result_array_ref: ArrayRef = array::make_array(result_array_data); + + // // This is a "map" operation - each row should reflect the time of the + // // corresponding input row, and have the same length. + // ensure!( + // result_array_ref.len() == info.num_rows(), + // "unexpected result length from Python UDF" + // ); + + panic!("arrayref") } } @@ -61,41 +117,14 @@ impl PyUdfEvaluator { impl Udf { #[new] fn new(signature: String, callable: Py) -> Self { - // FRAZ: Parse the signature and create an instance of PyUdf. - // Then, create an `Arc` from that and pass that to - // Udf. + let uuid = Uuid::new_v4(); + // TODO: sig name string cannot be &'static str + let signature = Signature::try_from_str(FeatureSetPart::Function("udf"), &signature).expect("signature to parse"); let udf = PyUdf { - signature: todo!(), + signature, callable, - uuid: todo!(), + uuid, }; Udf(Arc::new(udf)) } } - -// #[pyfunction] -// #[pyo3(signature = (udf, result_type, *args))] -// pub(super) fn call_udf<'py>( -// py: Python<'py>, -// udf: &'py PyAny, -// result_type: &'py PyAny, -// args: &'py PyTuple, -// ) -> PyResult<&'py PyAny> { -// let result_type = DataType::from_pyarrow(result_type)?; - -// // 1. Make sure we can convert each input to and from arrow arrays. 
-// let mut udf_args = Vec::with_capacity(args.len() + 1); -// udf_args.push(result_type.to_pyarrow(py)?); -// for arg in args { -// let array_data = ArrayData::from_pyarrow(arg)?; -// let py_array: PyObject = array_data.to_pyarrow(py)?; -// udf_args.push(py_array); -// } -// let args = PyTuple::new(py, udf_args); -// let result = udf.call_method("run_pyarrow", args, None)?; - -// let array_data: ArrayData = ArrayData::from_pyarrow(result)?; -// assert_eq!(array_data.data_type(), &result_type); - -// Ok(result) -// } From e0782a2675e16d461a43dcbeae095aacd9b65cf9 Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Tue, 22 Aug 2023 23:18:31 -0700 Subject: [PATCH 03/18] Add call_udf method to expr --- crates/sparrow-session/src/session.rs | 5 +- python/pysrc/kaskada/_ffi.pyi | 7 ++ python/pysrc/kaskada/_timestream.py | 114 ++++++++++++-------------- python/pysrc/kaskada/udf.py | 2 +- python/src/expr.rs | 29 +++++++ python/src/udf.rs | 4 +- 6 files changed, 93 insertions(+), 68 deletions(-) diff --git a/crates/sparrow-session/src/session.rs b/crates/sparrow-session/src/session.rs index e5b616342..89efb8dee 100644 --- a/crates/sparrow-session/src/session.rs +++ b/crates/sparrow-session/src/session.rs @@ -291,12 +291,11 @@ impl Session { /// /// This bypasses much of the plumbing of the [ExprOp] required due to our construction /// of the AST. - #[allow(unused)] pub fn add_udf_to_dfg( &mut self, udf: Arc, args: Vec, - ) -> error_stack::Result { + ) -> error_stack::Result { let signature = udf.signature(); let arg_names = signature.arg_names().to_owned(); signature.assert_valid_argument_count(args.len()); @@ -334,7 +333,7 @@ impl Session { .collect(); Err(Error::Errors(errors))? } else { - Ok(result) + Ok(Expr(result)) } } diff --git a/python/pysrc/kaskada/_ffi.pyi b/python/pysrc/kaskada/_ffi.pyi index d77c3ae58..aabc16f6a 100644 --- a/python/pysrc/kaskada/_ffi.pyi +++ b/python/pysrc/kaskada/_ffi.pyi @@ -25,6 +25,12 @@ class Expr: args: Sequence[Expr], ) -> Expr: ... @staticmethod + def call_udf( + session: Session, + udf: Udf, # TODO: ffi udf? + args: Sequence[Expr], + ) -> Expr: ... + @staticmethod def literal( session: Session, value: int | float | str | None, @@ -35,6 +41,7 @@ class Expr: def session(self) -> Session: ... def execute(self, options: Optional[ExecutionOptions] = None) -> Execution: ... def grouping(self) -> Optional[str]: ... 
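(Aside, not part of the patch: to make the new `call_udf` entry point concrete, the intended end-to-end flow can be sketched as below. It assumes the session bootstrap `kd.init_session()` and reuses the `CsvString` source from the tests; note that at this commit `Udf.__call__` does not yet `return` the Timestream — patch 04 adds that.

import kaskada as kd
import pandas as pd

kd.init_session()

@kd.udf("add(x: N, y: N) -> N")
def add(x: pd.Series, y: pd.Series) -> pd.Series:
    return x + y

content = "\n".join(
    [
        "time,key,m,n",
        "1996-12-19T16:39:57,A,5,10",
        "1996-12-19T16:39:58,B,24,3",
    ]
)
source = kd.sources.CsvString(content, time_column_name="time", key_column_name="key")
# Applying the wrapped UDF routes through Timestream._call_udf, which calls
# _ffi.Expr.call_udf(session=..., udf=add._ffi, args=...).
result = add(source.col("m"), source.col("n"))
)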
+
 
 class Table(Expr):
     def __init__(
diff --git a/python/pysrc/kaskada/_timestream.py b/python/pysrc/kaskada/_timestream.py
index a074c0bd3..fb00abfba 100644
--- a/python/pysrc/kaskada/_timestream.py
+++ b/python/pysrc/kaskada/_timestream.py
@@ -67,72 +67,61 @@ def _literal(value: Literal, session: _ffi.Session) -> Timestream:
         raise TypeError("Cannot create a literal Timestream from a datetime")
         return Timestream(_ffi.Expr.literal(session, value))
 
-    # @staticmethod
-    # def _test_apply_udf(
-    #     signature: inspect.Signature,
-    #     *args: Union[Timestream, Literal],
-    #     session: Optional[_ffi.Session] = None,
-    # ) -> Timestream:
-    #     """
-    #     todo
-    #     """
-    #     if session is None:
-    #         session = next(
-    #             arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream)
-    #         )
-
-    #     def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr:
-    #         if isinstance(arg, Timestream):
-    #             return arg._ffi_expr
-    #         else:
-    #             return Timestream._literal(arg, session)._ffi_expr
-
-    #     ffi_args = [make_arg(arg) for arg in args]
-
-    #     arg_types = [param.annotation for param in signature.parameters.values()]
-    #     result_type = signature.return_annotation
-
-    #     try:
-    #         return Timestream(
-    #             _ffi.Expr.udf(session=session, arg_types=arg_types, result_type=result_type, args=ffi_args)
-    #         )
-    #     except TypeError as e:
-    #         # noqa: DAR401
-    #         raise _augment_error(args, TypeError(str(e))) from e
-    #     except ValueError as e:
-    #         raise _augment_error(args, ValueError(str(e))) from e
-
-    # @staticmethod
-    # def _test_decorator_udf(
-    #     signature: inspect.Signature,
-    #     *args: Union[Timestream, Literal],
-    #     session: Optional[_ffi.Session] = None,
-    # ) -> Timestream:
-    #     """
-    #     todo
-    #     """
-    #     if session is None:
-    #         session = next(
-    #             arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream)
-    #         )
-
-    #     def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr:
-    #         if isinstance(arg, Timestream):
-    #             return arg._ffi_expr
-    #         else:
-    #             return Timestream._literal(arg, session)._ffi_expr
-
-    #     try:
-    #         return Timestream(
-    #             _ffi.Expr.decorator_udf(session=session, args=ffi_args)
-    #         )
-    #     except TypeError as e:
-    #         # noqa: DAR401
-    #         raise _augment_error(args, TypeError(str(e))) from e
-    #     except ValueError as e:
-    #         raise _augment_error(args, ValueError(str(e))) from e
+    @staticmethod
+    def _call_udf(
+        udf: _ffi.Udf,
+        *args: Union[Timestream, Literal],
+        session: Optional[_ffi.Session] = None,
+    ) -> Timestream:
+        """
+        Construct a new Timestream by calling the given UDF.
+
+        Parameters
+        ----------
+        udf : _ffi.Udf
+            The UDF to apply.
+        *args : Timestream | int | str | float | bool | None
+            List of arguments to the expression.
+        session : FFI Session
+            FFI Session to create the expression in.
+            If unspecified, will infer from the arguments.
+            Will fail if all arguments are literals and the session is not provided.
+
+        Returns
+        -------
+        Timestream
+            Timestream representing the result of the UDF applied to the arguments.
+
+        Raises
+        ------
+        # noqa: DAR401 _augment_error
+        TypeError
+            If the argument types are invalid for the given function.
+        ValueError
+            If the argument values are invalid for the given function.
+ """ + if session is None: + session = next( + arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream) + ) - # try: - # return Timestream( - # _ffi.Expr.decorator_udf(session=session, args=ffi_args) - # ) - # except TypeError as e: - # # noqa: DAR401 - # raise _augment_error(args, TypeError(str(e))) from e - # except ValueError as e: - # raise _augment_error(args, ValueError(str(e))) from e + def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: + if isinstance(arg, Timestream): + return arg._ffi_expr + else: + return Timestream._literal(arg, session)._ffi_expr + ffi_args = [make_arg(arg) for arg in args] + try: + return Timestream( + _ffi.Expr.call_udf(session=session, udf=udf, args=ffi_args) + ) + except TypeError as e: + # noqa: DAR401 + raise _augment_error(args, TypeError(str(e))) from e + except ValueError as e: + raise _augment_error(args, ValueError(str(e))) from e @staticmethod @@ -182,7 +171,6 @@ def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: ffi_args = [make_arg(arg) for arg in args] try: return Timestream( - # TODO: FRAZ - so I need a `call` that can take the udf _ffi.Expr.call(session=session, operation=func, args=ffi_args) ) except TypeError as e: diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index df699bb62..6121d0f8b 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -23,7 +23,7 @@ def __init__(self, signature: str, func: Callable[..., pa.Array]) -> None: def __call__(self, *args: Timestream | Literal) -> Timestream: """Apply the UDF to the given arguments.""" # TODO: Timestream>_udf_call? - Timestream._call(self._ffi, *args) + Timestream._call_udf(self._ffi, *args) def udf(signature: str): """Decorate a function for use as a Kaskada UDF.""" diff --git a/python/src/expr.rs b/python/src/expr.rs index 0acc47464..4a19ab3c9 100644 --- a/python/src/expr.rs +++ b/python/src/expr.rs @@ -1,10 +1,12 @@ use crate::error::Result; use crate::execution::Execution; use crate::session::Session; +use crate::udf::Udf; use arrow::datatypes::DataType; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; +use std::sync::Arc; use sparrow_session::{Expr as RustExpr, Literal, Session as RustSession}; /// Kaskada expression node. @@ -44,6 +46,33 @@ impl Expr { Ok(Self { rust_expr, session }) } + /// Create a new udf. + /// + /// This creates a new udf based on the `udf` and `args` provided. + #[staticmethod] + #[pyo3(signature = (session, udf, args))] + fn call_udf(session: Session, udf: Udf, args: Vec) -> PyResult { + let sparrow_udf = udf.0; + if !args.iter().all(|e| e.session() == session) { + return Err(PyValueError::new_err( + "all arguments must be in the same session", + )); + } + + let mut rust_session = session.rust_session()?; + let args: Vec<_> = args.into_iter().map(|e| e.rust_expr).collect(); + let rust_expr = match rust_session.add_udf_to_dfg(sparrow_udf.clone(), args) { + Ok(node) => node, + Err(e) => { + // DO NOT SUBMIT: Better error handling. 
+ return Err(PyValueError::new_err(e.to_string())); + } + }; + std::mem::drop(rust_session); + + Ok(Self { rust_expr, session }) + } + #[staticmethod] #[pyo3(signature = (session, value))] fn literal(session: Session, value: Option) -> PyResult { diff --git a/python/src/udf.rs b/python/src/udf.rs index db9c525de..bfffdf640 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -25,7 +25,8 @@ struct PyUdfEvaluator { #[derive(Clone)] #[pyclass] -pub(crate) struct Udf(Arc); +// TODO: Does it need to extend Expr for some reason? +pub(crate) struct Udf(pub Arc); impl sparrow_instructions::Udf for PyUdf { fn signature(&self) -> &Signature { @@ -116,6 +117,7 @@ impl PyUdfEvaluator { #[pymethods] impl Udf { #[new] + #[pyo3(signature = (signature, callable))] fn new(signature: String, callable: Py) -> Self { let uuid = Uuid::new_v4(); // TODO: sig name string cannot be &'static str From f75050f976180c8507b459de87c2a236978de41a Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Wed, 23 Aug 2023 00:19:53 -0700 Subject: [PATCH 04/18] Fix is_new and value mixup --- crates/sparrow-compiler/src/ast_to_dfg.rs | 5 +++-- .../sparrow-compiler/src/plan/expression_to_plan.rs | 7 ++++++- crates/sparrow-instructions/src/evaluators.rs | 5 ++++- .../src/execute/operation/expression_executor.rs | 6 ++++++ crates/sparrow-session/src/session.rs | 1 + python/pysrc/kaskada/udf.py | 4 ++-- python/src/expr.rs | 2 +- python/src/udf.rs | 11 +++++------ 8 files changed, 28 insertions(+), 13 deletions(-) diff --git a/crates/sparrow-compiler/src/ast_to_dfg.rs b/crates/sparrow-compiler/src/ast_to_dfg.rs index 2b05e12f8..84be28e3a 100644 --- a/crates/sparrow-compiler/src/ast_to_dfg.rs +++ b/crates/sparrow-compiler/src/ast_to_dfg.rs @@ -104,6 +104,7 @@ pub fn add_udf_to_dfg( data_context: &mut DataContext, diagnostics: &mut DiagnosticCollector<'_>, ) -> anyhow::Result { + println!("sparrow_compiler::add_udf_to_dfg"); let argument_types = arguments.transform(|i| i.with_value(i.value_type().clone())); let signature = udf.signature(); @@ -141,8 +142,8 @@ pub fn add_udf_to_dfg( }) .try_collect()?; - let is_new = dfg.add_udf(udf.clone(), args.iter().map(|i| i.value()).collect())?; - let value = is_any_new(dfg, &args)?; + let value = dfg.add_udf(udf.clone(), args.iter().map(|i| i.value()).collect())?; + let is_new = is_any_new(dfg, &args)?; let time_domain_check = TimeDomainCheck::Compatible; let time_domain = diff --git a/crates/sparrow-compiler/src/plan/expression_to_plan.rs b/crates/sparrow-compiler/src/plan/expression_to_plan.rs index d767dcb90..ee9b8b2d7 100644 --- a/crates/sparrow-compiler/src/plan/expression_to_plan.rs +++ b/crates/sparrow-compiler/src/plan/expression_to_plan.rs @@ -35,11 +35,16 @@ pub(super) fn dfg_to_plan( InstKind::FieldRef => "field_ref".to_owned(), InstKind::Record => "record".to_owned(), InstKind::Cast(_) => "cast".to_owned(), - InstKind::Udf(udf) => udf.signature().name().to_owned(), + InstKind::Udf(udf) => { + println!("dfg_to_plan udf: {:?}", udf.signature().name()); + udf.signature().name().to_owned() + // TODO: Do we need to serialize the callable into the plan? 
+ } }; let result_type = infer_result_type(inst, &arguments, plan_builder.expressions(operation_index)?)?; + println!("result type: {:?}", result_type); let operator = expression_plan::Operator::Instruction(name); (operator, result_type) diff --git a/crates/sparrow-instructions/src/evaluators.rs b/crates/sparrow-instructions/src/evaluators.rs index 719748aa7..4bf096720 100644 --- a/crates/sparrow-instructions/src/evaluators.rs +++ b/crates/sparrow-instructions/src/evaluators.rs @@ -160,7 +160,10 @@ pub fn create_evaluator(info: StaticInfo<'_>) -> anyhow::Result Ok(RecordEvaluator::try_new(info)?), - InstKind::Udf(udf) => Ok(udf.make_evaluator(info)), + InstKind::Udf(udf) => { + println!("Making udf evaluator"); + Ok(udf.make_evaluator(info)) + } } } diff --git a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs index 33b6c4944..2bf0645a6 100644 --- a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs +++ b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs @@ -95,6 +95,12 @@ impl ExpressionExecutor { Operator::Instruction(inst) => { // TODO: This could probably be cleaned up, possibly by eliminating // `inst kind` entirely. + println!("Inst: {:?}", inst); + // TODO: OH SHIT LOL + // It's called `add`, which is a built-in function, so it's just doing add + // HAHAHA + // This needs to make the udf + // but hm...how? let inst_kind = if inst == "field_ref" { InstKind::FieldRef } else if inst == "record" { diff --git a/crates/sparrow-session/src/session.rs b/crates/sparrow-session/src/session.rs index 89efb8dee..099fd78cd 100644 --- a/crates/sparrow-session/src/session.rs +++ b/crates/sparrow-session/src/session.rs @@ -296,6 +296,7 @@ impl Session { udf: Arc, args: Vec, ) -> error_stack::Result { + println!("session::add_udf_to_dfg"); let signature = udf.signature(); let arg_names = signature.arg_names().to_owned(); signature.assert_valid_argument_count(args.len()); diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index 6121d0f8b..421fec992 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -22,8 +22,8 @@ def __init__(self, signature: str, func: Callable[..., pa.Array]) -> None: def __call__(self, *args: Timestream | Literal) -> Timestream: """Apply the UDF to the given arguments.""" - # TODO: Timestream>_udf_call? - Timestream._call_udf(self._ffi, *args) + print("Calling udf") + return Timestream._call_udf(self._ffi, *args) def udf(signature: str): """Decorate a function for use as a Kaskada UDF.""" diff --git a/python/src/expr.rs b/python/src/expr.rs index 4a19ab3c9..a623580f0 100644 --- a/python/src/expr.rs +++ b/python/src/expr.rs @@ -33,7 +33,6 @@ impl Expr { let mut rust_session = session.rust_session()?; let args: Vec<_> = args.into_iter().map(|e| e.rust_expr).collect(); - // TODO: - Support adding a UDF here. 
let rust_expr = match rust_session.add_expr(&operation, args) { Ok(node) => node, Err(e) => { @@ -52,6 +51,7 @@ impl Expr { #[staticmethod] #[pyo3(signature = (session, udf, args))] fn call_udf(session: Session, udf: Udf, args: Vec) -> PyResult { + println!("SparrowExpr::call_udf"); let sparrow_udf = udf.0; if !args.iter().all(|e| e.session() == session) { return Err(PyValueError::new_err( diff --git a/python/src/udf.rs b/python/src/udf.rs index bfffdf640..6c4a97500 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -35,13 +35,12 @@ impl sparrow_instructions::Udf for PyUdf { // FRAZ: Should we allow this to return an error? fn make_evaluator(&self, info: StaticInfo<'_>) -> Box { - println!("Making udf evaluator"); - // Idea: Create a random value of each argument type and verify that the callbale + // TODO: Idea: Create a random value of each argument type and verify that the callbale // is invokable on that. This would let us detect errors early. + println!("Making python udf evaluator"); let args = info.args.iter().map(|arg| arg.value_ref.clone()).collect(); - // FRAZ: Get the value refs for each static arg. Box::new(PyUdfEvaluator { args, callable: self.callable.clone(), @@ -55,6 +54,7 @@ impl sparrow_instructions::Udf for PyUdf { impl Evaluator for PyUdfEvaluator { fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { + println!("Evaluating Python UDF"); let args = self.args .iter() .map(|value_ref| -> anyhow::Result<_> { @@ -62,7 +62,7 @@ impl Evaluator for PyUdfEvaluator { }).try_collect()?; let result = Python::with_gil(|py| -> anyhow::Result { - // todo: uhh + // todo: Ok(self.evaluate_py(py, args)) })?; // FRAZ: handle python errors. @@ -72,8 +72,6 @@ impl Evaluator for PyUdfEvaluator { impl PyUdfEvaluator { fn evaluate_py<'py>(&self, py: Python<'py>, args: Vec) -> ArrayRef { - println!("Evaluating python"); - // self.callable.call1(py, (args,)).unwrap().extract(py).unwrap(); // let args: Vec = args // .iter() @@ -119,6 +117,7 @@ impl Udf { #[new] #[pyo3(signature = (signature, callable))] fn new(signature: String, callable: Py) -> Self { + println!("Creating udf with signature: {:?}", signature); let uuid = Uuid::new_v4(); // TODO: sig name string cannot be &'static str let signature = Signature::try_from_str(FeatureSetPart::Function("udf"), &signature).expect("signature to parse"); From 9ba704584179a450bf9413526858586ce788f242 Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Wed, 23 Aug 2023 00:21:30 -0700 Subject: [PATCH 05/18] fix comment --- .../src/execute/operation/expression_executor.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs index 2bf0645a6..184bacb54 100644 --- a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs +++ b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs @@ -96,11 +96,9 @@ impl ExpressionExecutor { // TODO: This could probably be cleaned up, possibly by eliminating // `inst kind` entirely. println!("Inst: {:?}", inst); - // TODO: OH SHIT LOL + // TODO: FRAZ // It's called `add`, which is a built-in function, so it's just doing add - // HAHAHA - // This needs to make the udf - // but hm...how? + // This needs to make the udf instead. But how? Do we serialize the Arc? 
let inst_kind = if inst == "field_ref" { InstKind::FieldRef } else if inst == "record" { From 2664783cf7b7b604ac332beaaf005e1bdb1c005f Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Wed, 23 Aug 2023 14:13:39 -0700 Subject: [PATCH 06/18] don't hold gil for entire collect pyarrow method --- Cargo.lock | 1 + .../src/plan/expression_to_plan.rs | 4 +-- crates/sparrow-runtime/src/execute.rs | 34 ++++++++++++++++--- .../src/execute/compute_executor.rs | 5 +++ .../sparrow-runtime/src/execute/operation.rs | 17 +++++++--- .../execute/operation/expression_executor.rs | 19 +++++++---- crates/sparrow-session/Cargo.toml | 1 + crates/sparrow-session/src/session.rs | 12 +++++-- python/Cargo.lock | 1 + python/Pipfile | 11 ++++++ python/pysrc/kaskada/_result.py | 1 + .../golden/udf_test/test_add_udf_direct.jsonl | 6 ++++ .../golden/udf_test/test_add_udf_pipe.jsonl | 6 ++++ python/src/execution.rs | 16 +++++---- python/src/expr.rs | 3 +- python/src/udf.rs | 27 ++++++++------- 16 files changed, 125 insertions(+), 39 deletions(-) create mode 100644 python/Pipfile create mode 100644 python/pytests/golden/udf_test/test_add_udf_direct.jsonl create mode 100644 python/pytests/golden/udf_test/test_add_udf_pipe.jsonl diff --git a/Cargo.lock b/Cargo.lock index a3ef7d2fa..357b82c94 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4774,6 +4774,7 @@ dependencies = [ "derive_more", "error-stack", "futures", + "hashbrown 0.14.0", "itertools 0.11.0", "smallvec", "sparrow-api", diff --git a/crates/sparrow-compiler/src/plan/expression_to_plan.rs b/crates/sparrow-compiler/src/plan/expression_to_plan.rs index ee9b8b2d7..99495e4fe 100644 --- a/crates/sparrow-compiler/src/plan/expression_to_plan.rs +++ b/crates/sparrow-compiler/src/plan/expression_to_plan.rs @@ -37,14 +37,12 @@ pub(super) fn dfg_to_plan( InstKind::Cast(_) => "cast".to_owned(), InstKind::Udf(udf) => { println!("dfg_to_plan udf: {:?}", udf.signature().name()); - udf.signature().name().to_owned() - // TODO: Do we need to serialize the callable into the plan? 
+ udf.uuid().to_string() } }; let result_type = infer_result_type(inst, &arguments, plan_builder.expressions(operation_index)?)?; - println!("result type: {:?}", result_type); let operator = expression_plan::Operator::Instruction(name); (operator, result_type) diff --git a/crates/sparrow-runtime/src/execute.rs b/crates/sparrow-runtime/src/execute.rs index 915057ef1..45630ad7d 100644 --- a/crates/sparrow-runtime/src/execute.rs +++ b/crates/sparrow-runtime/src/execute.rs @@ -4,6 +4,7 @@ use chrono::NaiveDateTime; use enum_map::EnumMap; use error_stack::{IntoReport, IntoReportCompat, ResultExt}; use futures::Stream; +use hashbrown::HashMap; use prost_wkt_types::Timestamp; use sparrow_api::kaskada::v1alpha::execute_request::Limits; use sparrow_api::kaskada::v1alpha::{ @@ -12,7 +13,9 @@ use sparrow_api::kaskada::v1alpha::{ }; use sparrow_arrow::scalar_value::ScalarValue; use sparrow_compiler::{hash_compute_plan_proto, DataContext}; +use sparrow_instructions::Udf; use sparrow_qfr::kaskada::sparrow::v1alpha::FlightRecordHeader; +use uuid::Uuid; use crate::execute::compute_store_guard::ComputeStoreGuard; use crate::execute::error::Error; @@ -67,9 +70,15 @@ pub async fn execute( ..ExecutionOptions::default() }; - // let output_at_time = request.final_result_time; - - execute_new(plan, destination, data_context, options, None).await + execute_new( + plan, + destination, + data_context, + options, + None, + &HashMap::new(), + ) + .await } #[derive(Default, Debug)] @@ -215,12 +224,18 @@ async fn load_key_hash_inverse( /// ---------- /// - key_hash_inverse: If set, specifies the key hash inverses to use. If None, the /// key hashes will be created. +/// - udfs: contains the mapping of uuid to udf implementation. This is currently used +/// so we can serialize the uuid to the ComputePlan, then look up what implementation to use +/// when creating the evaluator. This works because we are on a single machine, and don't need +/// to pass the plan around. However, we'll eventually need to look into serializing/pickling +/// the callable. pub async fn execute_new( plan: ComputePlan, destination: Destination, mut data_context: DataContext, options: ExecutionOptions, key_hash_inverse: Option>, + udfs: &HashMap>, ) -> error_stack::Result>, Error> { let object_stores = Arc::new(ObjectStoreRegistry::default()); @@ -287,6 +302,7 @@ pub async fn execute_new( progress_updates_rx, destination, options.stop_signal_rx, + udfs, ) .await .change_context(Error::internal_msg("spawn compute executor"))?; @@ -327,5 +343,15 @@ pub async fn materialize( // TODO: the `execute_with_progress` method contains a lot of additional logic that is theoretically not needed, // as the materialization does not exit, and should not need to handle cleanup tasks that regular // queries do. We should likely refactor this to use a separate `materialize_with_progress` method. 
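The `udfs` map documented above is a plain side-table: the plan carries only the uuid string, and the executor resolves it back to the implementation. Schematically, under the same single-machine assumption (`UdfRegistry` is a hypothetical name):

    import uuid

    class UdfRegistry:
        def __init__(self):
            self._udfs = {}  # uuid.UUID -> callable

        def register(self, func) -> str:
            key = uuid.uuid4()
            self._udfs[key] = func
            return str(key)  # only this string is written into the plan

        def resolve(self, name: str):
            # Mirrors the executor: anything that parses as a uuid is a UDF;
            # anything else falls through to the builtin instructions.
            try:
                return self._udfs.get(uuid.UUID(name))
            except ValueError:
                return None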
- execute_new(plan, destination, data_context, options, None).await + + // TODO: Materialize with udfs + execute_new( + plan, + destination, + data_context, + options, + None, + &HashMap::new(), + ) + .await } diff --git a/crates/sparrow-runtime/src/execute/compute_executor.rs b/crates/sparrow-runtime/src/execute/compute_executor.rs index 479ee2ef6..fb63354e9 100644 --- a/crates/sparrow-runtime/src/execute/compute_executor.rs +++ b/crates/sparrow-runtime/src/execute/compute_executor.rs @@ -5,16 +5,19 @@ use enum_map::EnumMap; use error_stack::{IntoReport, IntoReportCompat, ResultExt}; use futures::stream::{FuturesUnordered, PollNext}; use futures::{FutureExt, Stream, TryFutureExt}; +use hashbrown::HashMap; use prost_wkt_types::Timestamp; use sparrow_api::kaskada::v1alpha::ComputeSnapshot; use sparrow_api::kaskada::v1alpha::{ExecuteResponse, LateBoundValue, PlanHash}; use sparrow_arrow::scalar_value::ScalarValue; +use sparrow_instructions::Udf; use sparrow_qfr::io::writer::FlightRecordWriter; use sparrow_qfr::kaskada::sparrow::v1alpha::FlightRecordHeader; use sparrow_qfr::FlightRecorderFactory; use tokio_stream::wrappers::UnboundedReceiverStream; use tokio_stream::StreamExt; use tracing::{error, info, info_span, Instrument}; +use uuid::Uuid; use crate::execute::compute_store_guard::ComputeStoreGuard; use crate::execute::operation::{OperationContext, OperationExecutor}; @@ -56,6 +59,7 @@ impl ComputeExecutor { progress_updates_rx: tokio::sync::mpsc::Receiver, destination: Destination, stop_signal_rx: Option>, + udfs: &HashMap>, ) -> error_stack::Result { let mut spawner = ComputeTaskSpawner::new(); @@ -147,6 +151,7 @@ impl ComputeExecutor { max_event_time_tx.clone(), late_bindings, stop_signal_rx.clone(), + &udfs, ) .await?, ); diff --git a/crates/sparrow-runtime/src/execute/operation.rs b/crates/sparrow-runtime/src/execute/operation.rs index f7d09c0c4..679f5b3af 100644 --- a/crates/sparrow-runtime/src/execute/operation.rs +++ b/crates/sparrow-runtime/src/execute/operation.rs @@ -42,14 +42,16 @@ use chrono::NaiveDateTime; use enum_map::EnumMap; use error_stack::{IntoReport, IntoReportCompat, Report, Result, ResultExt}; use futures::Future; +use hashbrown::HashMap; use prost_wkt_types::Timestamp; use sparrow_api::kaskada::v1alpha::operation_plan::tick_operation::TickBehavior; use sparrow_api::kaskada::v1alpha::{operation_plan, ComputePlan, LateBoundValue, OperationPlan}; use sparrow_arrow::scalar_value::ScalarValue; use sparrow_compiler::DataContext; -use sparrow_instructions::ComputeStore; +use sparrow_instructions::{ComputeStore, Udf}; use tokio::task::JoinHandle; use tracing::Instrument; +use uuid::Uuid; use self::final_tick::FinalTickOperation; use self::input_batch::InputBatch; @@ -176,6 +178,7 @@ impl OperationExecutor { max_event_time_tx: tokio::sync::mpsc::UnboundedSender, late_bindings: &EnumMap>, stop_signal_rx: Option>, + udfs: &HashMap>, ) -> Result>, Error> { let Self { operation, @@ -188,10 +191,14 @@ impl OperationExecutor { let operation_label = operator.label(); - let mut expression_executor = - ExpressionExecutor::try_new(operation_label, operation.expressions, late_bindings) - .into_report() - .change_context(Error::internal_msg("unable to create executor"))?; + let mut expression_executor = ExpressionExecutor::try_new( + operation_label, + operation.expressions, + late_bindings, + udfs, + ) + .into_report() + .change_context(Error::internal_msg("unable to create executor"))?; debug_assert_eq!(operator.input_len(), input_channels.len()); diff --git 
a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs index 184bacb54..b8030d181 100644 --- a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs +++ b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs @@ -6,15 +6,17 @@ use arrow::array::ArrayRef; use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; use arrow::record_batch::RecordBatch; use enum_map::EnumMap; +use hashbrown::HashMap; use itertools::Itertools; use sparrow_api::kaskada::v1alpha::expression_plan::Operator; use sparrow_api::kaskada::v1alpha::{ExpressionPlan, LateBoundValue, OperationInputRef}; use sparrow_arrow::scalar_value::ScalarValue; -use sparrow_instructions::ValueRef; use sparrow_instructions::{ create_evaluator, ColumnarValue, ComputeStore, Evaluator, GroupingIndices, InstKind, InstOp, RuntimeInfo, StaticArg, StaticInfo, StoreKey, }; +use sparrow_instructions::{Udf, ValueRef}; +use uuid::Uuid; use crate::execute::operation::InputBatch; use crate::Batch; @@ -49,6 +51,7 @@ impl ExpressionExecutor { operation_label: &'static str, expressions: Vec, late_bindings: &EnumMap>, + udfs: &HashMap>, ) -> anyhow::Result { let mut input_columns = Vec::new(); @@ -96,9 +99,6 @@ impl ExpressionExecutor { // TODO: This could probably be cleaned up, possibly by eliminating // `inst kind` entirely. println!("Inst: {:?}", inst); - // TODO: FRAZ - // It's called `add`, which is a built-in function, so it's just doing add - // This needs to make the udf instead. But how? Do we serialize the Arc? let inst_kind = if inst == "field_ref" { InstKind::FieldRef } else if inst == "record" { @@ -106,8 +106,15 @@ impl ExpressionExecutor { } else if inst == "cast" { InstKind::Cast(data_type.clone()) } else { - let inst_op = InstOp::from_str(&inst)?; - InstKind::Simple(inst_op) + // This assumes we'll never have an InstOp function name that + // matches a uuid, which should be safe. + if let Ok(uuid) = Uuid::from_str(&inst) { + let udf = udfs.get(&uuid).ok_or(anyhow::anyhow!("expected udf"))?; + InstKind::Udf(udf.clone()) + } else { + let inst_op = InstOp::from_str(&inst)?; + InstKind::Simple(inst_op) + } }; let static_info = StaticInfo::new(&inst_kind, args, &data_type); diff --git a/crates/sparrow-session/Cargo.toml b/crates/sparrow-session/Cargo.toml index fde919198..117bcab16 100644 --- a/crates/sparrow-session/Cargo.toml +++ b/crates/sparrow-session/Cargo.toml @@ -11,6 +11,7 @@ The Sparrow session builder. [dependencies] arrow-array.workspace = true +hashbrown.workspace = true arrow-schema.workspace = true arrow-select.workspace = true derive_more.workspace = true diff --git a/crates/sparrow-session/src/session.rs b/crates/sparrow-session/src/session.rs index 099fd78cd..46cac7d28 100644 --- a/crates/sparrow-session/src/session.rs +++ b/crates/sparrow-session/src/session.rs @@ -1,5 +1,5 @@ +use hashbrown::HashMap; use std::borrow::Cow; -use std::collections::HashMap; use std::sync::Arc; use arrow_schema::SchemaRef; @@ -25,6 +25,13 @@ pub struct Session { data_context: DataContext, dfg: Dfg, key_hash_inverse: HashMap>, + /// Keeps track of the uuid mapping. 
+ /// + /// This is a bit of a hack that allows to serialize the uuid instead of the + /// `Arc>, } #[derive(Default)] @@ -296,7 +303,6 @@ impl Session { udf: Arc, args: Vec, ) -> error_stack::Result { - println!("session::add_udf_to_dfg"); let signature = udf.signature(); let arg_names = signature.arg_names().to_owned(); signature.assert_valid_argument_count(args.len()); @@ -334,6 +340,7 @@ impl Session { .collect(); Err(Error::Errors(errors))? } else { + self.udfs.insert(udf.uuid().clone(), udf.clone()); Ok(Expr(result)) } } @@ -416,6 +423,7 @@ impl Session { data_context, options, Some(key_hash_inverse), + &self.udfs, )) .change_context(Error::Execute)? .map_err(|e| e.change_context(Error::Execute)) diff --git a/python/Cargo.lock b/python/Cargo.lock index 0dae8ea3c..754ff6bff 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3961,6 +3961,7 @@ dependencies = [ "derive_more", "error-stack", "futures", + "hashbrown 0.14.0", "itertools 0.11.0", "smallvec", "sparrow-api", diff --git a/python/Pipfile b/python/Pipfile new file mode 100644 index 000000000..71e4f7cbf --- /dev/null +++ b/python/Pipfile @@ -0,0 +1,11 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] + +[dev-packages] + +[requires] +python_version = "3.9" diff --git a/python/pysrc/kaskada/_result.py b/python/pysrc/kaskada/_result.py index eea4e04ac..4377a5dcf 100644 --- a/python/pysrc/kaskada/_result.py +++ b/python/pysrc/kaskada/_result.py @@ -46,6 +46,7 @@ def to_pyarrow(self) -> pa.Table: This method will block on the complete results of the query and collect all results into memory. If this is not desired, use `iter_pyarrow` instead. """ + print("Converting to_pyarrow") batches = self._ffi_execution.collect_pyarrow() if len(batches) == 0: return pa.Table.from_batches([], schema=pa.schema([])) diff --git a/python/pytests/golden/udf_test/test_add_udf_direct.jsonl b/python/pytests/golden/udf_test/test_add_udf_direct.jsonl new file mode 100644 index 000000000..5f101ddf1 --- /dev/null +++ b/python/pytests/golden/udf_test/test_add_udf_direct.jsonl @@ -0,0 +1,6 @@ +{"_time":"1996-12-19T16:39:57.000000000","_key":"A","result":15.0} +{"_time":"1996-12-19T16:39:58.000000000","_key":"B","result":27.0} +{"_time":"1996-12-19T16:39:59.000000000","_key":"A","result":23.0} +{"_time":"1996-12-19T16:40:00.000000000","_key":"A","result":null} +{"_time":"1996-12-19T16:40:01.000000000","_key":"A","result":null} +{"_time":"1996-12-19T16:40:02.000000000","_key":"A","result":23.0} diff --git a/python/pytests/golden/udf_test/test_add_udf_pipe.jsonl b/python/pytests/golden/udf_test/test_add_udf_pipe.jsonl new file mode 100644 index 000000000..358c577c5 --- /dev/null +++ b/python/pytests/golden/udf_test/test_add_udf_pipe.jsonl @@ -0,0 +1,6 @@ +{"_time":"1996-12-19T16:39:57.000000000","_key":"A","result":15.0} +{"_time":"1996-12-19T16:39:58.000000000","_key":"B","result":27.0} +{"_time":"1996-12-19T16:39:59.000000000","_key":"A","result":23.0} +{"_time":"1996-12-19T16:40:00.000000000","_key":"A","result":null} +{"_time":"1996-12-19T16:40:01.000000000","_key":"A","result":null} +{"_time":"1996-12-19T16:40:02.000000000","_key":"A","result":null} diff --git a/python/src/execution.rs b/python/src/execution.rs index 51b663289..c49db64be 100644 --- a/python/src/execution.rs +++ b/python/src/execution.rs @@ -53,14 +53,18 @@ impl Execution {} #[pymethods] impl Execution { - fn collect_pyarrow(&mut self, py: Python<'_>) -> Result> { + fn collect_pyarrow(&mut self) -> Result> { let execution = 
self.take_execution()?; let batches = execution.collect_all_blocking()?; - let results = batches - .into_iter() - .map(|batch| batch.to_pyarrow(py)) - .collect::>>()?; - Ok(results) + println!("Execution::collect_pyarrow: batches.len() = {}", batches.len()); + let result = Python::with_gil(|py| -> PyResult> { + let results = batches + .into_iter() + .map(|batch| batch.to_pyarrow(py)) + .collect::>>()?; + Ok(results) + }); + result.map_err(|_| error_stack::report!(ErrorContext::Python).into()) } fn next_pyarrow(&mut self, py: Python<'_>) -> Result> { diff --git a/python/src/expr.rs b/python/src/expr.rs index a623580f0..d0f5111ca 100644 --- a/python/src/expr.rs +++ b/python/src/expr.rs @@ -6,8 +6,8 @@ use arrow::datatypes::DataType; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; -use std::sync::Arc; use sparrow_session::{Expr as RustExpr, Literal, Session as RustSession}; +use std::sync::Arc; /// Kaskada expression node. #[derive(Clone)] @@ -61,6 +61,7 @@ impl Expr { let mut rust_session = session.rust_session()?; let args: Vec<_> = args.into_iter().map(|e| e.rust_expr).collect(); + let rust_expr = match rust_session.add_udf_to_dfg(sparrow_udf.clone(), args) { Ok(node) => node, Err(e) => { diff --git a/python/src/udf.rs b/python/src/udf.rs index 6c4a97500..65059a91b 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -1,8 +1,8 @@ +use itertools::Itertools; use std::fmt::Debug; use std::sync::Arc; -use itertools::Itertools; -use arrow::array::{ArrayData, ArrayRef}; +use arrow::array::{new_empty_array, ArrayData, ArrayRef}; use arrow::datatypes::DataType; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::prelude::*; @@ -25,7 +25,7 @@ struct PyUdfEvaluator { #[derive(Clone)] #[pyclass] -// TODO: Does it need to extend Expr for some reason? +// TODO: Does it need to extend Expr for some reason? pub(crate) struct Udf(pub Arc); impl sparrow_instructions::Udf for PyUdf { @@ -55,14 +55,15 @@ impl sparrow_instructions::Udf for PyUdf { impl Evaluator for PyUdfEvaluator { fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { println!("Evaluating Python UDF"); - let args = self.args + let args = self + .args .iter() - .map(|value_ref| -> anyhow::Result<_> { - Ok(info.value(value_ref)?.array_ref()?) - }).try_collect()?; - + .map(|value_ref| -> anyhow::Result<_> { Ok(info.value(value_ref)?.array_ref()?) }) + .try_collect()?; + let result = Python::with_gil(|py| -> anyhow::Result { - // todo: + // todo: + println!("inside gil"); Ok(self.evaluate_py(py, args)) })?; // FRAZ: handle python errors. 
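Per batch, the evaluator resolves each ValueRef to an Arrow array and hands the arrays to the callable under the GIL; the callable must return a same-length array (the row-count `ensure!` added later in the series). The contract, restated as a Python sketch (`evaluate` here is illustrative, not the pyo3 code):

    import pyarrow as pa

    def evaluate(callable_, args: list[pa.Array], num_rows: int) -> pa.Array:
        result = callable_(*args)
        assert isinstance(result, pa.Array), "Python UDF must return a pyarrow Array"
        assert len(result) == num_rows, "result has wrong number of rows"
        return result

    evaluate(lambda x: x, [pa.array([1, 2, 3])], 3)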
@@ -73,7 +74,7 @@ impl Evaluator for PyUdfEvaluator { impl PyUdfEvaluator { fn evaluate_py<'py>(&self, py: Python<'py>, args: Vec) -> ArrayRef { // self.callable.call1(py, (args,)).unwrap().extract(py).unwrap(); - // let args: Vec = args + // let args: Vec = args // .iter() // .map::, _>(|value_ref| { // let value: ColumnarValue = info.value(value_ref)?; @@ -108,7 +109,8 @@ impl PyUdfEvaluator { // "unexpected result length from Python UDF" // ); - panic!("arrayref") + println!("array ref panic?"); + new_empty_array(&DataType::Int32) } } @@ -120,7 +122,8 @@ impl Udf { println!("Creating udf with signature: {:?}", signature); let uuid = Uuid::new_v4(); // TODO: sig name string cannot be &'static str - let signature = Signature::try_from_str(FeatureSetPart::Function("udf"), &signature).expect("signature to parse"); + let signature = Signature::try_from_str(FeatureSetPart::Function("udf"), &signature) + .expect("signature to parse"); let udf = PyUdf { signature, callable, From b38a71c42a9eea841fe45a0d6fe8ccac9b255d93 Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Wed, 23 Aug 2023 14:28:32 -0700 Subject: [PATCH 07/18] Fix gil handling in collect --- python/src/execution.rs | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/python/src/execution.rs b/python/src/execution.rs index c49db64be..19d3e0d46 100644 --- a/python/src/execution.rs +++ b/python/src/execution.rs @@ -53,18 +53,21 @@ impl Execution {} #[pymethods] impl Execution { - fn collect_pyarrow(&mut self) -> Result> { - let execution = self.take_execution()?; - let batches = execution.collect_all_blocking()?; - println!("Execution::collect_pyarrow: batches.len() = {}", batches.len()); - let result = Python::with_gil(|py| -> PyResult> { - let results = batches - .into_iter() - .map(|batch| batch.to_pyarrow(py)) - .collect::>>()?; - Ok(results) - }); - result.map_err(|_| error_stack::report!(ErrorContext::Python).into()) + fn collect_pyarrow(&mut self, py: Python<'_>) -> Result> { + let execution = self.take_execution().unwrap(); + + // Explicitly allow threads to take the gil during execution, so + // the udf evaluator can acquire it to execute python. 
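The deadlock this change avoids can be modeled with an ordinary lock standing in for the GIL: a thread that holds the lock while blocking on workers that also need it never finishes. A rough analogy only (threading.Lock playing the GIL's role):

    import threading

    gil = threading.Lock()

    def udf_evaluator():
        with gil:  # the evaluator needs the "GIL" to run the callable
            pass

    t = threading.Thread(target=udf_evaluator)
    with gil:  # collect_pyarrow previously held it for the whole query...
        t.start()
        # ...so joining here would deadlock. Releasing first (allow_threads)
        # lets the evaluator make progress.
    t.join()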
+        let batches = py.allow_threads(move || {
+            execution.collect_all_blocking()
+        })?;
+
+        let results = batches
+            .into_iter()
+            .map(|batch| batch.to_pyarrow(py))
+            .collect::<PyResult<Vec<PyObject>>>()?;
+        Ok(results)
+        // result.map_err(|_| error_stack::report!(ErrorContext::Python).into())
     }
 
     fn next_pyarrow(&mut self, py: Python<'_>) -> Result<Option<PyObject>> {

From fccc9fb380bed28334ce6106f62a83ddeff9f5db Mon Sep 17 00:00:00 2001
From: Jordan Frazier
Date: Thu, 24 Aug 2023 12:05:24 -0700
Subject: [PATCH 08/18] Attempting to invoke callable -- arguments aren't
 correct yet

---
 crates/sparrow-instructions/src/evaluators.rs |   2 +-
 python/pysrc/kaskada/_ffi.pyi                 |   2 +-
 python/pysrc/kaskada/_timestream.py           |  10 --
 python/pysrc/kaskada/udf.py                   |  13 +-
 .../golden/udf_test/test_add_udf_direct.jsonl |   6 -
 .../golden/udf_test/test_add_udf_pipe.jsonl   |   6 -
 python/src/execution.rs                       |   7 +-
 python/src/udf.rs                             | 122 +++++++++++-------
 8 files changed, 87 insertions(+), 81 deletions(-)
 delete mode 100644 python/pytests/golden/udf_test/test_add_udf_direct.jsonl
 delete mode 100644 python/pytests/golden/udf_test/test_add_udf_pipe.jsonl

diff --git a/crates/sparrow-instructions/src/evaluators.rs b/crates/sparrow-instructions/src/evaluators.rs
index 4bf096720..d0cc6950f 100644
--- a/crates/sparrow-instructions/src/evaluators.rs
+++ b/crates/sparrow-instructions/src/evaluators.rs
@@ -45,7 +45,7 @@ use time::*;
 pub struct StaticInfo<'a> {
     inst_kind: &'a InstKind,
     pub args: Vec,
-    result_type: &'a DataType,
+    pub result_type: &'a DataType,
 }
 
 impl<'a> StaticInfo<'a> {
diff --git a/python/pysrc/kaskada/_ffi.pyi b/python/pysrc/kaskada/_ffi.pyi
index aabc16f6a..80423ba4c 100644
--- a/python/pysrc/kaskada/_ffi.pyi
+++ b/python/pysrc/kaskada/_ffi.pyi
@@ -27,7 +27,7 @@ class Expr:
     @staticmethod
     def call_udf(
         session: Session,
-        udf: Udf, # TODO: ffi udf?
+        udf: Udf,
         args: Sequence[Expr],
     ) -> Expr: ...
     @staticmethod
diff --git a/python/pysrc/kaskada/_timestream.py b/python/pysrc/kaskada/_timestream.py
index fb00abfba..1649b54c0 100644
--- a/python/pysrc/kaskada/_timestream.py
+++ b/python/pysrc/kaskada/_timestream.py
@@ -257,16 +257,6 @@ def pipe(
         else:
             return func(self, *args, **kwargs)
 
-# TODO: How does udf work? We kind of want it like... apply some function on an input?
-# But also want like `Timestream.udf(f, args*)`
-    # def udf(callable: Callable, *args: Timestream) -> Timestream:
-    # """
-    # todo
-    # """
-    # signature = inspect.signature(callable)
-
-    # return Timestream._test_udf(signature, args)
-
     def add(self, rhs: Union[Timestream, Literal]) -> Timestream:
         """
         Create a Timestream adding this and `rhs`.
diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py
index 421fec992..32ba28ece 100644
--- a/python/pysrc/kaskada/udf.py
+++ b/python/pysrc/kaskada/udf.py
@@ -33,7 +33,8 @@ def decorator(func: FuncType):
         # This needs to take the PyArrow result type and PyArrow inputs,
         # convert them to Pandas, call the function, and convert the result
         # back to PyArrow.
-        func = lambda *args: _converted_func(func, *args)
+        func = lambda result_type, *args: _converted_func(func, result_type, *args)
+        # func = lambda *args: _converted_func(func, pa.int64(), *args)
 
         # 2. Create the UDF object.
         return Udf(signature, func)
@@ -45,8 +46,14 @@ def _converted_func(func: FuncType, result_type: pa.DataType, *args: pa.Array) -
     """Run the function producing the given result type."""
     # TODO: I believe this will return a series for simple arrays, and a
     # dataframe for struct arrays. 
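The conversion this decorator performs can be exercised in isolation; a sketch mirroring `_converted_func` as it settles later in the series (assumes a pandas-based UDF and a numeric result type):

    import pandas as pd
    import pyarrow as pa

    def converted(func, result_type: pa.DataType, *args: pa.Array) -> pa.Array:
        pd_args = [arg.to_pandas() for arg in args]
        pd_result = func(*pd_args)
        return pa.Array.from_pandas(pd_result, type=result_type)

    out = converted(lambda x, y: x + y, pa.int64(),
                    pa.array([1, 2]), pa.array([10, 20]))
    assert out.equals(pa.array([11, 22], type=pa.int64()))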
- pd_args = [arg.to_pandas() for arg in args] - pd_result = func(*pd_args) + print("CONVERTING FUNC") + print(f"Result type: ", result_type) + print(f"Args: ", args) + print() + + # pd_args = [arg.to_pandas() for arg in args] + # pd_result = func(*pd_args) + pd_result = func(*args) if isinstance(pd_result, pd.Series): return pa.Array.from_pandas(pd_result, type=result_type) diff --git a/python/pytests/golden/udf_test/test_add_udf_direct.jsonl b/python/pytests/golden/udf_test/test_add_udf_direct.jsonl deleted file mode 100644 index 5f101ddf1..000000000 --- a/python/pytests/golden/udf_test/test_add_udf_direct.jsonl +++ /dev/null @@ -1,6 +0,0 @@ -{"_time":"1996-12-19T16:39:57.000000000","_key":"A","result":15.0} -{"_time":"1996-12-19T16:39:58.000000000","_key":"B","result":27.0} -{"_time":"1996-12-19T16:39:59.000000000","_key":"A","result":23.0} -{"_time":"1996-12-19T16:40:00.000000000","_key":"A","result":null} -{"_time":"1996-12-19T16:40:01.000000000","_key":"A","result":null} -{"_time":"1996-12-19T16:40:02.000000000","_key":"A","result":23.0} diff --git a/python/pytests/golden/udf_test/test_add_udf_pipe.jsonl b/python/pytests/golden/udf_test/test_add_udf_pipe.jsonl deleted file mode 100644 index 358c577c5..000000000 --- a/python/pytests/golden/udf_test/test_add_udf_pipe.jsonl +++ /dev/null @@ -1,6 +0,0 @@ -{"_time":"1996-12-19T16:39:57.000000000","_key":"A","result":15.0} -{"_time":"1996-12-19T16:39:58.000000000","_key":"B","result":27.0} -{"_time":"1996-12-19T16:39:59.000000000","_key":"A","result":23.0} -{"_time":"1996-12-19T16:40:00.000000000","_key":"A","result":null} -{"_time":"1996-12-19T16:40:01.000000000","_key":"A","result":null} -{"_time":"1996-12-19T16:40:02.000000000","_key":"A","result":null} diff --git a/python/src/execution.rs b/python/src/execution.rs index 19d3e0d46..569194ec9 100644 --- a/python/src/execution.rs +++ b/python/src/execution.rs @@ -56,18 +56,15 @@ impl Execution { fn collect_pyarrow(&mut self, py: Python<'_>) -> Result> { let execution = self.take_execution().unwrap(); - // Explicitly allow threads to take the gil during execution, so + // Explicitly allow threads to take the gil during execution, so // the udf evaluator can acquire it to execute python. - let batches = py.allow_threads(move || { - execution.collect_all_blocking() - })?; + let batches = py.allow_threads(move || execution.collect_all_blocking())?; let results = batches .into_iter() .map(|batch| batch.to_pyarrow(py)) .collect::>>()?; Ok(results) - // result.map_err(|_| error_stack::report!(ErrorContext::Python).into()) } fn next_pyarrow(&mut self, py: Python<'_>) -> Result> { diff --git a/python/src/udf.rs b/python/src/udf.rs index 65059a91b..39ddebddb 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -6,8 +6,8 @@ use arrow::array::{new_empty_array, ArrayData, ArrayRef}; use arrow::datatypes::DataType; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::prelude::*; -use pyo3::types::PyTuple; -use sparrow_instructions::{Evaluator, RuntimeInfo, StaticInfo, ValueRef}; +use pyo3::types::{PyDict, PyTuple}; +use sparrow_instructions::{ColumnarValue, Evaluator, RuntimeInfo, StaticInfo, ValueRef}; use sparrow_syntax::{FeatureSetPart, Signature}; use uuid::Uuid; @@ -15,12 +15,13 @@ use uuid::Uuid; struct PyUdf { signature: Signature, callable: Py, - uuid: uuid::Uuid, + uuid: Uuid, } struct PyUdfEvaluator { args: Vec, callable: Py, + result_type: DataType, } #[derive(Clone)] @@ -39,11 +40,13 @@ impl sparrow_instructions::Udf for PyUdf { // is invokable on that. 
This would let us detect errors early. println!("Making python udf evaluator"); + let result_type = info.result_type.clone(); let args = info.args.iter().map(|arg| arg.value_ref.clone()).collect(); Box::new(PyUdfEvaluator { args, callable: self.callable.clone(), + result_type, }) } @@ -55,62 +58,83 @@ impl sparrow_instructions::Udf for PyUdf { impl Evaluator for PyUdfEvaluator { fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result { println!("Evaluating Python UDF"); - let args = self + let inputs = self .args .iter() .map(|value_ref| -> anyhow::Result<_> { Ok(info.value(value_ref)?.array_ref()?) }) .try_collect()?; - let result = Python::with_gil(|py| -> anyhow::Result { - // todo: - println!("inside gil"); - Ok(self.evaluate_py(py, args)) - })?; - // FRAZ: handle python errors. + let result = + Python::with_gil(|py| -> anyhow::Result { self.evaluate_py(py, inputs) })?; + + anyhow::ensure!( + result.len() == info.num_rows(), + "result has wrong number of rows" + ); Ok(result) } } impl PyUdfEvaluator { - fn evaluate_py<'py>(&self, py: Python<'py>, args: Vec) -> ArrayRef { - // self.callable.call1(py, (args,)).unwrap().extract(py).unwrap(); - // let args: Vec = args - // .iter() - // .map::, _>(|value_ref| { - // let value: ColumnarValue = info.value(value_ref)?; - // let array_ref: ArrayRef = value.array_ref()?; - // let array_data: ArrayData = aray_ref.to_data(); - // let py_obj: PyObject = array_data.to_pyarrow(py)?; - - // Ok(py_obj) - // }) - // .collect::>()?; - - // let locals = [("args", args)].into_py_dict(py); - // py.run(&self.code, None, Some(&locals))?; - // let result_py: PyObject = locals.get_item("ret").unwrap().into(); - - // let result_array_data: ArrayData = ArrayData::from_pyarrow(result_py.as_ref(py))?; - - // // We can't control the returned type, but we can refuse to accept the "wrong" type. - // ensure!( - // *result_array_data.data_type() == self.result_type, - // "expected Python UDF to return type {:?}, but received {:?}", - // result_array_data.data_type(), - // self.result_type - // ); - - // let result_array_ref: ArrayRef = array::make_array(result_array_data); - - // // This is a "map" operation - each row should reflect the time of the - // // corresponding input row, and have the same length. - // ensure!( - // result_array_ref.len() == info.num_rows(), - // "unexpected result length from Python UDF" - // ); - - println!("array ref panic?"); - new_empty_array(&DataType::Int32) + fn evaluate_py<'py>(&self, py: Python<'py>, inputs: Vec) -> anyhow::Result { + // 1. Convert all arg inputs into pyarrow arrays then to python objects + let inputs: Vec = inputs + .iter() + .map::, _>(|arg_array| { + let array_data: ArrayData = arg_array.to_data(); + let py_obj: PyObject = array_data.to_pyarrow(py)?; + Ok(py_obj) + }) + .collect::>()?; + + // let asdf: Vec = vec![DataType::to_pyarrow(&self.result_type, py)?]; + // combine asdf and args + // let args: Vec = asdf.into_iter().chain(inputs.into_iter()).collect(); + + // let args = PyTuple::new(py, args); + + // 2. 
Call the python function + // Call the Python function + let result_ty = DataType::to_pyarrow(&self.result_type, py)?; + // let kwargs = PyDict::new(py); + // Set the 'result_type' key to a specific value + // kwargs.set_item("result_type", result_ty)?; + + let result_ty_vec: Vec = vec![result_ty]; + let inputs: Vec<_> = result_ty_vec.into_iter().chain(inputs).collect(); + let inputs = PyTuple::new(py, inputs); + + println!("CALLING CALLABLE "); + let py_result = self.callable.call1(py, inputs)?; + // let py_result = self.callable.call(py, inputs, Some(kwargs))?; + + ////////// + // let mut new_args: Vec = vec![DataType::to_pyarrow(&self.result_type, py)?]; + + // // Collect the other arguments + // for input in inputs.iter() { + // let array_data: ArrayData = input.to_data(); + // let py_obj: PyObject = array_data.to_pyarrow(py)?; + // new_args.push(py_obj); + // } + + // println!("New args: {:?}", new_args); + // let args = PyTuple::new(py, new_args); + // let py_result = self.callable.call(py, args, Some(kwargs))?; + + // 3. Convert the result from pyarrow array to arrow array + let result_array_data: ArrayData = ArrayData::from_pyarrow(py_result.as_ref(py))?; + + // We can't control the returned type, but we can refuse to accept the "wrong" type. + anyhow::ensure!( + *result_array_data.data_type() == self.result_type, + "expected Python UDF to return type {:?}, but received {:?}", + result_array_data.data_type(), + self.result_type + ); + + let result: ArrayRef = arrow::array::make_array(result_array_data); + Ok(result) } } From 7472216201075f5e52890824a12b06d03ad4ccde Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Thu, 24 Aug 2023 12:34:11 -0700 Subject: [PATCH 09/18] reusing func variable name was causing recursion in python lambda --- python/pysrc/kaskada/udf.py | 15 +++++++-------- python/src/udf.rs | 18 +++++++----------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index 32ba28ece..a598c6313 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -17,7 +17,7 @@ class Udf: def __init__(self, signature: str, func: Callable[..., pa.Array]) -> None: """Create a UDF with the given signature.""" - functools.update_wrapper(self, func) + # functools.update_wrapper(self, func) self._ffi = _ffi.Udf(signature, func) def __call__(self, *args: Timestream | Literal) -> Timestream: @@ -28,20 +28,19 @@ def __call__(self, *args: Timestream | Literal) -> Timestream: def udf(signature: str): """Decorate a function for use as a Kaskada UDF.""" - def decorator(func: FuncType): + def decorator(func_type: FuncType): # 1. Convert the `FuncType` to the type expected by Udf. # This needs to take the PyArrow result type and PyArrow inputs, # convert them to Pandas, call the function, and convert the result # back to PyArrow. - func = lambda result_type, *args: _converted_func(func, result_type, *args) - # func = lambda *args: _converted_func(func, pa.int64(), *args) + func = lambda result_type, *args: _converted_func(func_type, result_type, *args) # 2. Create the UDF object. return Udf(signature, func) return decorator -def _converted_func(func: FuncType, result_type: pa.DataType, *args: pa.Array) -> pa.Array: +def _converted_func(func_type: FuncType, result_type: pa.DataType, *args: pa.Array) -> pa.Array: """Run the function producing the given result type.""" # TODO: I believe this will return a series for simple arrays, and a # dataframe for struct arrays. 
We should explore how this handles @@ -51,9 +50,9 @@ def _converted_func(func: FuncType, result_type: pa.DataType, *args: pa.Array) - print(f"Args: ", args) print() - # pd_args = [arg.to_pandas() for arg in args] - # pd_result = func(*pd_args) - pd_result = func(*args) + pd_args = [arg.to_pandas() for arg in args] + pd_result = func_type(*pd_args) + # pd_result = func_type(*args) if isinstance(pd_result, pd.Series): return pa.Array.from_pandas(pd_result, type=result_type) diff --git a/python/src/udf.rs b/python/src/udf.rs index 39ddebddb..9de1d98fe 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -87,25 +87,21 @@ impl PyUdfEvaluator { }) .collect::>()?; - // let asdf: Vec = vec![DataType::to_pyarrow(&self.result_type, py)?]; - // combine asdf and args - // let args: Vec = asdf.into_iter().chain(inputs.into_iter()).collect(); - - // let args = PyTuple::new(py, args); - // 2. Call the python function - // Call the Python function let result_ty = DataType::to_pyarrow(&self.result_type, py)?; - // let kwargs = PyDict::new(py); - // Set the 'result_type' key to a specific value - // kwargs.set_item("result_type", result_ty)?; - let result_ty_vec: Vec = vec![result_ty]; let inputs: Vec<_> = result_ty_vec.into_iter().chain(inputs).collect(); let inputs = PyTuple::new(py, inputs); println!("CALLING CALLABLE "); let py_result = self.callable.call1(py, inputs)?; + + // let kwargs = PyDict::new(py); + // // Set the 'result_type' key to a specific value + // let result_ty = DataType::to_pyarrow(&self.result_type, py)?; + // kwargs.set_item("result_type", result_ty)?; + + // let inputs = PyTuple::new(py, inputs); // let py_result = self.callable.call(py, inputs, Some(kwargs))?; ////////// From 1c0f202e4644f4c309fd4d6c0f65e3b533ac506c Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Thu, 24 Aug 2023 17:30:57 -0700 Subject: [PATCH 10/18] clean up comments? --- crates/sparrow-runtime/src/execute.rs | 15 ++-- .../sparrow-runtime/src/execute/operation.rs | 1 - .../execute/operation/expression_executor.rs | 2 +- crates/sparrow-session/src/lib.rs | 2 - crates/sparrow-session/src/session.rs | 8 +-- python/Pipfile | 11 --- python/pysrc/kaskada/_result.py | 1 - python/pysrc/kaskada/_timestream.py | 69 ++----------------- python/pysrc/kaskada/udf.py | 11 +-- python/src/udf.rs | 42 +++-------- 10 files changed, 28 insertions(+), 134 deletions(-) delete mode 100644 python/Pipfile diff --git a/crates/sparrow-runtime/src/execute.rs b/crates/sparrow-runtime/src/execute.rs index 45630ad7d..6521447e3 100644 --- a/crates/sparrow-runtime/src/execute.rs +++ b/crates/sparrow-runtime/src/execute.rs @@ -24,7 +24,6 @@ use crate::execute::output::Destination; use crate::key_hash_inverse::{KeyHashInverse, ThreadSafeKeyHashInverse}; use crate::stores::ObjectStoreRegistry; use crate::RuntimeOptions; -pub use operation::WorkArea; mod checkpoints; mod compute_executor; @@ -344,14 +343,8 @@ pub async fn materialize( // as the materialization does not exit, and should not need to handle cleanup tasks that regular // queries do. We should likely refactor this to use a separate `materialize_with_progress` method. 
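The bug fixed in PATCH 09 above is a classic Python late-binding pitfall, reproducible in two lines: rebinding the wrapped function's own name makes the new lambda call itself.

    def broken(func):
        # `func` in the lambda body resolves at call time, by which point the
        # name is bound to the lambda itself -> RecursionError.
        func = lambda *args: func(*args)
        return func

    def fixed(func_type):
        # A distinct name (as the commit's `func_type` rename does) keeps the
        # closure pointing at the original function.
        return lambda *args: func_type(*args)

    assert fixed(abs)(-3) == 3
    # broken(abs)(-3)  # would raise RecursionError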
- // TODO: Materialize with udfs - execute_new( - plan, - destination, - data_context, - options, - None, - &HashMap::new(), - ) - .await + // TODO: Unimplemented feature - UDFs + let udfs = HashMap::new(); + + execute_new(plan, destination, data_context, options, None, &udfs).await } diff --git a/crates/sparrow-runtime/src/execute/operation.rs b/crates/sparrow-runtime/src/execute/operation.rs index 679f5b3af..d57eff0a6 100644 --- a/crates/sparrow-runtime/src/execute/operation.rs +++ b/crates/sparrow-runtime/src/execute/operation.rs @@ -68,7 +68,6 @@ use crate::execute::Error; use crate::key_hash_inverse::ThreadSafeKeyHashInverse; use crate::stores::ObjectStoreRegistry; use crate::Batch; -pub use expression_executor::WorkArea; /// Information used while creating operations. /// diff --git a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs index b8030d181..77e72f3bf 100644 --- a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs +++ b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs @@ -274,7 +274,7 @@ impl ExpressionExecutor { } /// Wrapper around the columns produced during execution. -pub struct WorkArea { +struct WorkArea { time: ArrayRef, subsort: ArrayRef, key_hash: ArrayRef, diff --git a/crates/sparrow-session/src/lib.rs b/crates/sparrow-session/src/lib.rs index a06a88662..bbc72adef 100644 --- a/crates/sparrow-session/src/lib.rs +++ b/crates/sparrow-session/src/lib.rs @@ -9,5 +9,3 @@ pub use execution::Execution; pub use expr::{Expr, Literal}; pub use session::{ExecutionOptions, Session}; pub use table::Table; - -// pub type Udf = dyn sparrow_instructions::Udf; diff --git a/crates/sparrow-session/src/session.rs b/crates/sparrow-session/src/session.rs index 46cac7d28..22c95d1db 100644 --- a/crates/sparrow-session/src/session.rs +++ b/crates/sparrow-session/src/session.rs @@ -27,10 +27,10 @@ pub struct Session { key_hash_inverse: HashMap>, /// Keeps track of the uuid mapping. /// - /// This is a bit of a hack that allows to serialize the uuid instead of the - /// `Arc>, } diff --git a/python/Pipfile b/python/Pipfile deleted file mode 100644 index 71e4f7cbf..000000000 --- a/python/Pipfile +++ /dev/null @@ -1,11 +0,0 @@ -[[source]] -url = "https://pypi.org/simple" -verify_ssl = true -name = "pypi" - -[packages] - -[dev-packages] - -[requires] -python_version = "3.9" diff --git a/python/pysrc/kaskada/_result.py b/python/pysrc/kaskada/_result.py index 4377a5dcf..eea4e04ac 100644 --- a/python/pysrc/kaskada/_result.py +++ b/python/pysrc/kaskada/_result.py @@ -46,7 +46,6 @@ def to_pyarrow(self) -> pa.Table: This method will block on the complete results of the query and collect all results into memory. If this is not desired, use `iter_pyarrow` instead. 
""" - print("Converting to_pyarrow") batches = self._ffi_execution.collect_pyarrow() if len(batches) == 0: return pa.Table.from_batches([], schema=pa.schema([])) diff --git a/python/pysrc/kaskada/_timestream.py b/python/pysrc/kaskada/_timestream.py index 1649b54c0..6dd437738 100644 --- a/python/pysrc/kaskada/_timestream.py +++ b/python/pysrc/kaskada/_timestream.py @@ -4,7 +4,6 @@ import sys import warnings -import inspect from datetime import datetime from datetime import timedelta @@ -67,66 +66,9 @@ def _literal(value: Literal, session: _ffi.Session) -> Timestream: raise TypeError("Cannot create a literal Timestream from a datetime") return Timestream(_ffi.Expr.literal(session, value)) - @staticmethod - def _call_udf( - udf: _ffi.Udf, - *args: Union[Timestream, Literal], - session: Optional[_ffi.Session] = None, - ) -> Timestream: - """ - Construct a new Timestream by calling the given function. - - Parameters - ---------- - func : str - Name of the function to apply. - *args : Timestream | int | str | float | bool | None - List of arguments to the expression. - session : FFI Session - FFI Session to create the expression in. - If unspecified, will infer from the arguments. - Will fail if all arguments are literals and the session is not provided. - - Returns - ------- - Timestream - Timestream representing the result of the function applied to the arguments. - - Raises - ------ - # noqa: DAR401 _augment_error - TypeError - If the argument types are invalid for the given function. - ValueError - If the argument values are invalid for the given function. - """ - if session is None: - session = next( - arg._ffi_expr.session() for arg in args if isinstance(arg, Timestream) - ) - - def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: - if isinstance(arg, Timestream): - return arg._ffi_expr - else: - return Timestream._literal(arg, session)._ffi_expr - - ffi_args = [make_arg(arg) for arg in args] - - try: - return Timestream( - _ffi.Expr.call_udf(session=session, udf=udf, args=ffi_args) - ) - except TypeError as e: - # noqa: DAR401 - raise _augment_error(args, TypeError(str(e))) from e - except ValueError as e: - raise _augment_error(args, ValueError(str(e))) from e - - @staticmethod def _call( - func: str, + func: Union[str, _ffi.Udf], *args: Union[Timestream, Literal], session: Optional[_ffi.Session] = None, ) -> Timestream: @@ -170,9 +112,12 @@ def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: ffi_args = [make_arg(arg) for arg in args] try: - return Timestream( - _ffi.Expr.call(session=session, operation=func, args=ffi_args) - ) + if isinstance(func, str): + return Timestream(_ffi.Expr.call(session=session, operation=func, args=ffi_args)) + elif isinstance(func, _ffi.Udf): + return Timestream(_ffi.Expr.call_udf(session=session, udf=func, args=ffi_args)) + else: + raise TypeError(f"invalid type for func. 
Expected str or udf, saw: {type(func)}") except TypeError as e: # noqa: DAR401 raise _augment_error(args, TypeError(str(e))) from e diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index a598c6313..f2327bcd0 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -17,13 +17,12 @@ class Udf: def __init__(self, signature: str, func: Callable[..., pa.Array]) -> None: """Create a UDF with the given signature.""" - # functools.update_wrapper(self, func) + functools.update_wrapper(self, func) self._ffi = _ffi.Udf(signature, func) def __call__(self, *args: Timestream | Literal) -> Timestream: """Apply the UDF to the given arguments.""" - print("Calling udf") - return Timestream._call_udf(self._ffi, *args) + return Timestream._call(self._ffi, *args) def udf(signature: str): """Decorate a function for use as a Kaskada UDF.""" @@ -45,14 +44,8 @@ def _converted_func(func_type: FuncType, result_type: pa.DataType, *args: pa.Arr # TODO: I believe this will return a series for simple arrays, and a # dataframe for struct arrays. We should explore how this handles # different types. - print("CONVERTING FUNC") - print(f"Result type: ", result_type) - print(f"Args: ", args) - print() - pd_args = [arg.to_pandas() for arg in args] pd_result = func_type(*pd_args) - # pd_result = func_type(*args) if isinstance(pd_result, pd.Series): return pa.Array.from_pandas(pd_result, type=result_type) diff --git a/python/src/udf.rs b/python/src/udf.rs index 9de1d98fe..9574d7744 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -87,46 +87,26 @@ impl PyUdfEvaluator { }) .collect::>()?; - // 2. Call the python function - let result_ty = DataType::to_pyarrow(&self.result_type, py)?; - let result_ty_vec: Vec = vec![result_ty]; - let inputs: Vec<_> = result_ty_vec.into_iter().chain(inputs).collect(); - let inputs = PyTuple::new(py, inputs); + // The callable expects [result_type, *args] + let mut py_inputs = vec![DataType::to_pyarrow(&self.result_type, py)?]; + py_inputs.extend(inputs); - println!("CALLING CALLABLE "); - let py_result = self.callable.call1(py, inputs)?; - - // let kwargs = PyDict::new(py); - // // Set the 'result_type' key to a specific value - // let result_ty = DataType::to_pyarrow(&self.result_type, py)?; - // kwargs.set_item("result_type", result_ty)?; - - // let inputs = PyTuple::new(py, inputs); - // let py_result = self.callable.call(py, inputs, Some(kwargs))?; - - ////////// - // let mut new_args: Vec = vec![DataType::to_pyarrow(&self.result_type, py)?]; + // let inputs: Vec<_> = result_ty_vec.into_iter().chain(inputs).collect(); + let inputs = PyTuple::new(py, py_inputs); - // // Collect the other arguments - // for input in inputs.iter() { - // let array_data: ArrayData = input.to_data(); - // let py_obj: PyObject = array_data.to_pyarrow(py)?; - // new_args.push(py_obj); - // } - - // println!("New args: {:?}", new_args); - // let args = PyTuple::new(py, new_args); - // let py_result = self.callable.call(py, args, Some(kwargs))?; + // 3. Execute the udf + let py_result = self.callable.call1(py, inputs)?; // 3. Convert the result from pyarrow array to arrow array let result_array_data: ArrayData = ArrayData::from_pyarrow(py_result.as_ref(py))?; - // We can't control the returned type, but we can refuse to accept the "wrong" type. + // We attempt to coerce the return type into our expected result type, but this + // check exists to ensure the roundtrip type is as expected. 
anyhow::ensure!( *result_array_data.data_type() == self.result_type, "expected Python UDF to return type {:?}, but received {:?}", - result_array_data.data_type(), self.result_type + result_array_data.data_type(), ); let result: ArrayRef = arrow::array::make_array(result_array_data); @@ -139,9 +119,7 @@ impl Udf { #[new] #[pyo3(signature = (signature, callable))] fn new(signature: String, callable: Py) -> Self { - println!("Creating udf with signature: {:?}", signature); let uuid = Uuid::new_v4(); - // TODO: sig name string cannot be &'static str let signature = Signature::try_from_str(FeatureSetPart::Function("udf"), &signature) .expect("signature to parse"); let udf = PyUdf { From 1e356d2230a4728bdc78817041c56ce4a0fd0d6c Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Thu, 24 Aug 2023 20:07:24 -0700 Subject: [PATCH 11/18] Add another udf for unit test --- python/pysrc/kaskada/__init__.py | 4 ++- python/pysrc/kaskada/_ffi.pyi | 7 +---- python/pysrc/kaskada/_timestream.py | 13 ++++++--- python/pysrc/kaskada/udf.py | 14 ++++++--- .../golden/udf_test/test_add_udf_direct.jsonl | 6 ++++ .../golden/udf_test/test_add_udf_pipe.jsonl | 6 ++++ python/pytests/udf_test.py | 29 +++++++++++++++---- python/src/udf.rs | 2 +- 8 files changed, 60 insertions(+), 21 deletions(-) create mode 100644 python/pytests/golden/udf_test/test_add_udf_direct.jsonl create mode 100644 python/pytests/golden/udf_test/test_add_udf_pipe.jsonl diff --git a/python/pysrc/kaskada/__init__.py b/python/pysrc/kaskada/__init__.py index 5c2d73fe5..620bb8233 100644 --- a/python/pysrc/kaskada/__init__.py +++ b/python/pysrc/kaskada/__init__.py @@ -10,8 +10,9 @@ from ._timestream import Literal from ._timestream import Timestream from ._timestream import record -from .udf import Udf from .udf import udf +from .udf import Udf + __all__ = [ "ExecutionOptions", @@ -22,6 +23,7 @@ "Result", "sources", "Timestream", + "Udf", "udf", "windows", ] diff --git a/python/pysrc/kaskada/_ffi.pyi b/python/pysrc/kaskada/_ffi.pyi index 80423ba4c..07791114b 100644 --- a/python/pysrc/kaskada/_ffi.pyi +++ b/python/pysrc/kaskada/_ffi.pyi @@ -41,7 +41,6 @@ class Expr: def session(self) -> Session: ... def execute(self, options: Optional[ExecutionOptions] = None) -> Execution: ... def grouping(self) -> Optional[str]: ... - class Table(Expr): def __init__( @@ -60,8 +59,4 @@ class Table(Expr): def add_pyarrow(self, data: pa.RecordBatch) -> None: ... class Udf(object): - def __init__( - self, - result_ty: str, - result_fn: Callable[..., pa.Array] - ) -> None: ... \ No newline at end of file + def __init__(self, result_ty: str, result_fn: Callable[..., pa.Array]) -> None: ... 
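The `ensure!` shown earlier in this commit enforces a simple contract: the array Python hands back must already carry the declared result type. In pyarrow terms (a standalone sketch, not the engine path; `check_result` is a hypothetical name):

    import pyarrow as pa

    def check_result(result: pa.Array, expected: pa.DataType) -> pa.Array:
        if result.type != expected:
            raise TypeError(
                f"expected Python UDF to return type {expected}, "
                f"but received {result.type}"
            )
        return result

    check_result(pa.array([1.0, 2.0]), pa.float64())  # passes
    # check_result(pa.array([1, 2]), pa.float64())    # raises TypeError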
diff --git a/python/pysrc/kaskada/_timestream.py b/python/pysrc/kaskada/_timestream.py index 6dd437738..fc9c92bce 100644 --- a/python/pysrc/kaskada/_timestream.py +++ b/python/pysrc/kaskada/_timestream.py @@ -4,7 +4,6 @@ import sys import warnings - from datetime import datetime from datetime import timedelta from typing import Callable @@ -113,11 +112,17 @@ def make_arg(arg: Union[Timestream, Literal]) -> _ffi.Expr: ffi_args = [make_arg(arg) for arg in args] try: if isinstance(func, str): - return Timestream(_ffi.Expr.call(session=session, operation=func, args=ffi_args)) + return Timestream( + _ffi.Expr.call(session=session, operation=func, args=ffi_args) + ) elif isinstance(func, _ffi.Udf): - return Timestream(_ffi.Expr.call_udf(session=session, udf=func, args=ffi_args)) + return Timestream( + _ffi.Expr.call_udf(session=session, udf=func, args=ffi_args) + ) else: - raise TypeError(f"invalid type for func. Expected str or udf, saw: {type(func)}") + raise TypeError( + f"invalid type for func. Expected str or udf, saw: {type(func)}" + ) except TypeError as e: # noqa: DAR401 raise _augment_error(args, TypeError(str(e))) from e diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index f2327bcd0..febccb9a7 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -7,13 +7,15 @@ import pandas as pd import pyarrow as pa -from ._timestream import Timestream -from ._timestream import Literal from . import _ffi +from ._timestream import Literal +from ._timestream import Timestream + # TODO: Allow functions to return `pd.DataFrame` for struct arrays. FuncType = Callable[..., pd.Series] + class Udf: def __init__(self, signature: str, func: Callable[..., pa.Array]) -> None: """Create a UDF with the given signature.""" @@ -24,6 +26,7 @@ def __call__(self, *args: Timestream | Literal) -> Timestream: """Apply the UDF to the given arguments.""" return Timestream._call(self._ffi, *args) + def udf(signature: str): """Decorate a function for use as a Kaskada UDF.""" @@ -39,7 +42,10 @@ def decorator(func_type: FuncType): return decorator -def _converted_func(func_type: FuncType, result_type: pa.DataType, *args: pa.Array) -> pa.Array: + +def _converted_func( + func_type: FuncType, result_type: pa.DataType, *args: pa.Array +) -> pa.Array: """Run the function producing the given result type.""" # TODO: I believe this will return a series for simple arrays, and a # dataframe for struct arrays. 
We should explore how this handles @@ -50,4 +56,4 @@ def _converted_func(func_type: FuncType, result_type: pa.DataType, *args: pa.Arr if isinstance(pd_result, pd.Series): return pa.Array.from_pandas(pd_result, type=result_type) else: - raise TypeError(f"Unsupported result type: {type(pd_result)}") \ No newline at end of file + raise TypeError(f"Unsupported result type: {type(pd_result)}") diff --git a/python/pytests/golden/udf_test/test_add_udf_direct.jsonl b/python/pytests/golden/udf_test/test_add_udf_direct.jsonl new file mode 100644 index 000000000..e1314e28a --- /dev/null +++ b/python/pytests/golden/udf_test/test_add_udf_direct.jsonl @@ -0,0 +1,6 @@ +{"_time":"1996-12-19T16:39:57.000000000","_key":"A","m":5.0,"n":10.0,"add":15.0,"add_x2":30.0} +{"_time":"1996-12-19T16:39:58.000000000","_key":"B","m":24.0,"n":3.0,"add":27.0,"add_x2":54.0} +{"_time":"1996-12-19T16:39:59.000000000","_key":"A","m":17.0,"n":6.0,"add":23.0,"add_x2":46.0} +{"_time":"1996-12-19T16:40:00.000000000","_key":"A","m":null,"n":9.0,"add":null,"add_x2":null} +{"_time":"1996-12-19T16:40:01.000000000","_key":"A","m":12.0,"n":null,"add":null,"add_x2":null} +{"_time":"1996-12-19T16:40:02.000000000","_key":"A","m":null,"n":null,"add":null,"add_x2":null} diff --git a/python/pytests/golden/udf_test/test_add_udf_pipe.jsonl b/python/pytests/golden/udf_test/test_add_udf_pipe.jsonl new file mode 100644 index 000000000..e1314e28a --- /dev/null +++ b/python/pytests/golden/udf_test/test_add_udf_pipe.jsonl @@ -0,0 +1,6 @@ +{"_time":"1996-12-19T16:39:57.000000000","_key":"A","m":5.0,"n":10.0,"add":15.0,"add_x2":30.0} +{"_time":"1996-12-19T16:39:58.000000000","_key":"B","m":24.0,"n":3.0,"add":27.0,"add_x2":54.0} +{"_time":"1996-12-19T16:39:59.000000000","_key":"A","m":17.0,"n":6.0,"add":23.0,"add_x2":46.0} +{"_time":"1996-12-19T16:40:00.000000000","_key":"A","m":null,"n":9.0,"add":null,"add_x2":null} +{"_time":"1996-12-19T16:40:01.000000000","_key":"A","m":12.0,"n":null,"add":null,"add_x2":null} +{"_time":"1996-12-19T16:40:02.000000000","_key":"A","m":null,"n":null,"add":null,"add_x2":null} diff --git a/python/pytests/udf_test.py b/python/pytests/udf_test.py index f58fe906b..366b51bd5 100644 --- a/python/pytests/udf_test.py +++ b/python/pytests/udf_test.py @@ -1,23 +1,31 @@ -import pandas as pd -import pyarrow as pa import kaskada as kd import kaskada._ffi as _ffi +import pandas as pd import pytest + @kd.udf("add(x: N, y: N) -> N") def add(x: pd.Series, y: pd.Series) -> pd.Series: """Use Pandas to add two numbers.""" return x + y +@kd.udf("add_x2(x: N, y: N) -> N") +def add_x2(x: pd.Series, y: pd.Series) -> pd.Series: + """Use Pandas to add and multiple""" + return (x + y) * 2 + + +@pytest.mark.skip(reason="currently not working") def test_docstring() -> None: - assert add.__doc__ == "Use Pandas to add two numbers." + assert add.__doc__ == "Use Pandas to add two numbers." 
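The skipped docstring test hinges on `functools.update_wrapper` copying metadata onto the `Udf` instance; the mechanism itself works in isolation:

    import functools

    class Wrapped:
        def __init__(self, func):
            functools.update_wrapper(self, func)

    def f(x):
        """Docs survive wrapping."""
        return x

    assert Wrapped(f).__doc__ == "Docs survive wrapping."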
def test_udf_instance() -> None: assert isinstance(add, kd.Udf) assert isinstance(add._ffi, _ffi.Udf) + @pytest.fixture def source_int64() -> kd.sources.CsvString: content = "\n".join( @@ -33,12 +41,23 @@ def source_int64() -> kd.sources.CsvString: ) return kd.sources.CsvString(content, time_column_name="time", key_column_name="key") + def test_add_udf_direct(golden, source_int64) -> None: m = source_int64.col("m") n = source_int64.col("n") - golden.jsonl(add(m, n)) + golden.jsonl(kd.record({"m": m, "n": n, "add": add(m, n), "add_x2": add_x2(m, n)})) + def test_add_udf_pipe(golden, source_int64) -> None: m = source_int64.col("m") n = source_int64.col("n") - golden.jsonl(m.pipe(add, n)) + golden.jsonl( + kd.record( + { + "m": m, + "n": n, + "add": m.pipe(add, n), + "add_x2": m.pipe(add_x2, n), + } + ) + ) diff --git a/python/src/udf.rs b/python/src/udf.rs index 9574d7744..f58e5a773 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -105,7 +105,7 @@ impl PyUdfEvaluator { anyhow::ensure!( *result_array_data.data_type() == self.result_type, "expected Python UDF to return type {:?}, but received {:?}", - self.result_type + self.result_type, result_array_data.data_type(), ); From e4149af9e0bff6bef7ad314974d7e672214754fb Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Thu, 24 Aug 2023 20:12:10 -0700 Subject: [PATCH 12/18] remove prints --- crates/sparrow-compiler/src/ast_to_dfg.rs | 1 - crates/sparrow-compiler/src/plan/expression_to_plan.rs | 5 +---- crates/sparrow-instructions/src/evaluators.rs | 5 +---- python/src/expr.rs | 3 +-- python/src/udf.rs | 8 +++----- 5 files changed, 6 insertions(+), 16 deletions(-) diff --git a/crates/sparrow-compiler/src/ast_to_dfg.rs b/crates/sparrow-compiler/src/ast_to_dfg.rs index 84be28e3a..c43ef25ab 100644 --- a/crates/sparrow-compiler/src/ast_to_dfg.rs +++ b/crates/sparrow-compiler/src/ast_to_dfg.rs @@ -104,7 +104,6 @@ pub fn add_udf_to_dfg( data_context: &mut DataContext, diagnostics: &mut DiagnosticCollector<'_>, ) -> anyhow::Result { - println!("sparrow_compiler::add_udf_to_dfg"); let argument_types = arguments.transform(|i| i.with_value(i.value_type().clone())); let signature = udf.signature(); diff --git a/crates/sparrow-compiler/src/plan/expression_to_plan.rs b/crates/sparrow-compiler/src/plan/expression_to_plan.rs index 99495e4fe..c3beb3a24 100644 --- a/crates/sparrow-compiler/src/plan/expression_to_plan.rs +++ b/crates/sparrow-compiler/src/plan/expression_to_plan.rs @@ -35,10 +35,7 @@ pub(super) fn dfg_to_plan( InstKind::FieldRef => "field_ref".to_owned(), InstKind::Record => "record".to_owned(), InstKind::Cast(_) => "cast".to_owned(), - InstKind::Udf(udf) => { - println!("dfg_to_plan udf: {:?}", udf.signature().name()); - udf.uuid().to_string() - } + InstKind::Udf(udf) => udf.uuid().to_string(), }; let result_type = diff --git a/crates/sparrow-instructions/src/evaluators.rs b/crates/sparrow-instructions/src/evaluators.rs index d0cc6950f..f5c84248a 100644 --- a/crates/sparrow-instructions/src/evaluators.rs +++ b/crates/sparrow-instructions/src/evaluators.rs @@ -160,10 +160,7 @@ pub fn create_evaluator(info: StaticInfo<'_>) -> anyhow::Result Ok(RecordEvaluator::try_new(info)?), - InstKind::Udf(udf) => { - println!("Making udf evaluator"); - Ok(udf.make_evaluator(info)) - } + InstKind::Udf(udf) => Ok(udf.make_evaluator(info)), } } diff --git a/python/src/expr.rs b/python/src/expr.rs index d0f5111ca..17deccf64 100644 --- a/python/src/expr.rs +++ b/python/src/expr.rs @@ -47,11 +47,10 @@ impl Expr { /// Create a new udf. 
 ///
-    /// This creates a new udf based on the `udf` and `args` provided.
+    /// This creates a new expression based on the `udf` and `args` provided.
     #[staticmethod]
     #[pyo3(signature = (session, udf, args))]
     fn call_udf(session: Session, udf: Udf, args: Vec<Expr>) -> PyResult<Self> {
-        println!("SparrowExpr::call_udf");
         let sparrow_udf = udf.0;
         if !args.iter().all(|e| e.session() == session) {
             return Err(PyValueError::new_err(
diff --git a/python/src/udf.rs b/python/src/udf.rs
index f58e5a773..d42cf1424 100644
--- a/python/src/udf.rs
+++ b/python/src/udf.rs
@@ -26,7 +26,6 @@ struct PyUdfEvaluator {
 
 #[derive(Clone)]
 #[pyclass]
-// TODO: Does it need to extend Expr for some reason?
 pub(crate) struct Udf(pub Arc<PyUdf>);
 
 impl sparrow_instructions::Udf for PyUdf {
@@ -34,12 +33,12 @@ impl sparrow_instructions::Udf for PyUdf {
         &self.signature
     }
 
-    // FRAZ: Should we allow this to return an error?
     fn make_evaluator(&self, info: StaticInfo<'_>) -> Box<dyn Evaluator> {
-        // TODO: Idea: Create a random value of each argument type and verify that the callbale
+        // Idea: Create a random value of each argument type and verify that the callable
         // is invokable on that. This would let us detect errors early.
+        //
+        // Though, it still wouldn't give us compile-time errors.
 
-        println!("Making python udf evaluator");
         let result_type = info.result_type.clone();
         let args = info.args.iter().map(|arg| arg.value_ref.clone()).collect();
 
@@ -57,7 +56,6 @@ impl sparrow_instructions::Udf for PyUdf {
 
 impl Evaluator for PyUdfEvaluator {
     fn evaluate(&mut self, info: &dyn RuntimeInfo) -> anyhow::Result<ColumnarValue> {
-        println!("Evaluating Python UDF");
         let inputs = self
             .args
             .iter()

From be6ded4feddfeaa6e25bc5670c49547f90232389 Mon Sep 17 00:00:00 2001
From: Jordan Frazier
Date: Thu, 24 Aug 2023 20:26:44 -0700
Subject: [PATCH 13/18] linting

---
 python/pysrc/kaskada/__init__.py | 2 +-
 python/pysrc/kaskada/udf.py      | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/pysrc/kaskada/__init__.py b/python/pysrc/kaskada/__init__.py
index 620bb8233..b4d5db186 100644
--- a/python/pysrc/kaskada/__init__.py
+++ b/python/pysrc/kaskada/__init__.py
@@ -10,8 +10,8 @@
 from ._timestream import Literal
 from ._timestream import Timestream
 from ._timestream import record
-from .udf import udf
 from .udf import Udf
+from .udf import udf
 
 
 __all__ = [
diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py
index febccb9a7..36c0a6d14 100644
--- a/python/pysrc/kaskada/udf.py
+++ b/python/pysrc/kaskada/udf.py
@@ -17,6 +17,8 @@
 
 
 class Udf:
+    """User defined function."""
+
     def __init__(self, signature: str, func: Callable[..., pa.Array]) -> None:
         """Create a UDF with the given signature."""
         functools.update_wrapper(self, func)
@@ -35,7 +37,8 @@ def decorator(func_type: FuncType):
         # This needs to take the PyArrow result type and PyArrow inputs,
         # convert them to Pandas, call the function, and convert the result
         # back to PyArrow.
-        func = lambda result_type, *args: _converted_func(func_type, result_type, *args)
+        def func(result_type, *args):
+            _converted_func(func_type, result_type, *args)
 
         # 2. Create the UDF object.
return Udf(signature, func) From e291e1f451401bc92255ae1b0473b211689812f6 Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Fri, 25 Aug 2023 10:21:54 -0700 Subject: [PATCH 14/18] Linting / print remove --- .../src/execute/operation/expression_executor.rs | 1 - python/pysrc/kaskada/udf.py | 1 + python/pytests/udf_test.py | 2 +- python/src/expr.rs | 1 - python/src/udf.rs | 6 +++--- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs index 77e72f3bf..776821808 100644 --- a/crates/sparrow-runtime/src/execute/operation/expression_executor.rs +++ b/crates/sparrow-runtime/src/execute/operation/expression_executor.rs @@ -98,7 +98,6 @@ impl ExpressionExecutor { Operator::Instruction(inst) => { // TODO: This could probably be cleaned up, possibly by eliminating // `inst kind` entirely. - println!("Inst: {:?}", inst); let inst_kind = if inst == "field_ref" { InstKind::FieldRef } else if inst == "record" { diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index 36c0a6d14..76a5f19a2 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -56,6 +56,7 @@ def _converted_func( pd_args = [arg.to_pandas() for arg in args] pd_result = func_type(*pd_args) + print("Result type: ", result_type) if isinstance(pd_result, pd.Series): return pa.Array.from_pandas(pd_result, type=result_type) else: diff --git a/python/pytests/udf_test.py b/python/pytests/udf_test.py index 366b51bd5..f05b4e4b1 100644 --- a/python/pytests/udf_test.py +++ b/python/pytests/udf_test.py @@ -12,7 +12,7 @@ def add(x: pd.Series, y: pd.Series) -> pd.Series: @kd.udf("add_x2(x: N, y: N) -> N") def add_x2(x: pd.Series, y: pd.Series) -> pd.Series: - """Use Pandas to add and multiple""" + """Use Pandas to add then multiply by 2""" return (x + y) * 2 diff --git a/python/src/expr.rs b/python/src/expr.rs index 17deccf64..5c427305d 100644 --- a/python/src/expr.rs +++ b/python/src/expr.rs @@ -7,7 +7,6 @@ use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; use sparrow_session::{Expr as RustExpr, Literal, Session as RustSession}; -use std::sync::Arc; /// Kaskada expression node. 
#[derive(Clone)] diff --git a/python/src/udf.rs b/python/src/udf.rs index d42cf1424..9c30d0c0d 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -2,12 +2,12 @@ use itertools::Itertools; use std::fmt::Debug; use std::sync::Arc; -use arrow::array::{new_empty_array, ArrayData, ArrayRef}; +use arrow::array::{ArrayData, ArrayRef}; use arrow::datatypes::DataType; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::prelude::*; -use pyo3::types::{PyDict, PyTuple}; -use sparrow_instructions::{ColumnarValue, Evaluator, RuntimeInfo, StaticInfo, ValueRef}; +use pyo3::types::PyTuple; +use sparrow_instructions::{Evaluator, RuntimeInfo, StaticInfo, ValueRef}; use sparrow_syntax::{FeatureSetPart, Signature}; use uuid::Uuid; From 05bdfaf62d416e1b0f6a353d58f7a9766979b9e6 Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Fri, 25 Aug 2023 10:51:58 -0700 Subject: [PATCH 15/18] cleanup comments --- python/src/udf.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/src/udf.rs b/python/src/udf.rs index 9c30d0c0d..e0be6c650 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -75,7 +75,7 @@ impl Evaluator for PyUdfEvaluator { impl PyUdfEvaluator { fn evaluate_py<'py>(&self, py: Python<'py>, inputs: Vec) -> anyhow::Result { - // 1. Convert all arg inputs into pyarrow arrays then to python objects + // Convert all arg inputs into pyarrow arrays then to python objects let inputs: Vec = inputs .iter() .map::, _>(|arg_array| { @@ -89,13 +89,11 @@ impl PyUdfEvaluator { let mut py_inputs = vec![DataType::to_pyarrow(&self.result_type, py)?]; py_inputs.extend(inputs); - // let inputs: Vec<_> = result_ty_vec.into_iter().chain(inputs).collect(); let inputs = PyTuple::new(py, py_inputs); - // 3. Execute the udf + // Execute the python udf let py_result = self.callable.call1(py, inputs)?; - // 3. Convert the result from pyarrow array to arrow array let result_array_data: ArrayData = ArrayData::from_pyarrow(py_result.as_ref(py))?; // We attempt to coerce the return type into our expected result type, but this From 23a9c39f8243891b9973e76ca2911a5ca243eef6 Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Fri, 25 Aug 2023 11:12:48 -0700 Subject: [PATCH 16/18] Actually return from the udf --- crates/sparrow-runtime/src/execute/compute_executor.rs | 3 ++- crates/sparrow-runtime/src/execute/operation.rs | 1 + crates/sparrow-session/src/session.rs | 2 +- python/pysrc/kaskada/udf.py | 2 +- python/src/udf.rs | 4 ++-- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/crates/sparrow-runtime/src/execute/compute_executor.rs b/crates/sparrow-runtime/src/execute/compute_executor.rs index fb63354e9..7faf111a9 100644 --- a/crates/sparrow-runtime/src/execute/compute_executor.rs +++ b/crates/sparrow-runtime/src/execute/compute_executor.rs @@ -51,6 +51,7 @@ pub struct ComputeResult { impl ComputeExecutor { /// Spawns the compute tasks using the new operation based executor. 
+ #[allow(clippy::too_many_arguments)] pub async fn try_spawn( mut context: OperationContext, plan_hash: PlanHash, @@ -151,7 +152,7 @@ impl ComputeExecutor { max_event_time_tx.clone(), late_bindings, stop_signal_rx.clone(), - &udfs, + udfs, ) .await?, ); diff --git a/crates/sparrow-runtime/src/execute/operation.rs b/crates/sparrow-runtime/src/execute/operation.rs index d57eff0a6..b642d0714 100644 --- a/crates/sparrow-runtime/src/execute/operation.rs +++ b/crates/sparrow-runtime/src/execute/operation.rs @@ -169,6 +169,7 @@ impl OperationExecutor { /// This function will not return until the operation is complete -- either /// because all inputs have been processed or because it has been /// interrupted due to an error or reaching a preview-rows limit, etc. + #[allow(clippy::too_many_arguments)] pub async fn execute( self, operation_index: usize, diff --git a/crates/sparrow-session/src/session.rs b/crates/sparrow-session/src/session.rs index 22c95d1db..e1f1cb908 100644 --- a/crates/sparrow-session/src/session.rs +++ b/crates/sparrow-session/src/session.rs @@ -340,7 +340,7 @@ impl Session { .collect(); Err(Error::Errors(errors))? } else { - self.udfs.insert(udf.uuid().clone(), udf.clone()); + self.udfs.insert(*udf.uuid(), udf.clone()); Ok(Expr(result)) } } diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index 76a5f19a2..8d7058257 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -38,7 +38,7 @@ def decorator(func_type: FuncType): # convert them to Pandas, call the function, and convert the result # back to PyArrow. def func(result_type, *args): - _converted_func(func_type, result_type, *args) + return _converted_func(func_type, result_type, *args) # 2. Create the UDF object. return Udf(signature, func) diff --git a/python/src/udf.rs b/python/src/udf.rs index e0be6c650..0b8634ab3 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -59,7 +59,7 @@ impl Evaluator for PyUdfEvaluator { let inputs = self .args .iter() - .map(|value_ref| -> anyhow::Result<_> { Ok(info.value(value_ref)?.array_ref()?) 
}) + .map(|value_ref| -> anyhow::Result<_> { info.value(value_ref)?.array_ref() }) .try_collect()?; let result = @@ -74,7 +74,7 @@ impl Evaluator for PyUdfEvaluator { } impl PyUdfEvaluator { - fn evaluate_py<'py>(&self, py: Python<'py>, inputs: Vec) -> anyhow::Result { + fn evaluate_py(&self, py: Python<'_>, inputs: Vec) -> anyhow::Result { // Convert all arg inputs into pyarrow arrays then to python objects let inputs: Vec = inputs .iter() From 0683b00630aaa7fa20daf61be1abf9163117d51e Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Fri, 25 Aug 2023 12:37:10 -0700 Subject: [PATCH 17/18] mypy and cargo test --- crates/sparrow-runtime/src/execute/operation/scan.rs | 2 ++ crates/sparrow-runtime/src/execute/operation/testing.rs | 3 +++ python/pysrc/kaskada/_ffi.pyi | 1 - 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/sparrow-runtime/src/execute/operation/scan.rs b/crates/sparrow-runtime/src/execute/operation/scan.rs index 1c87e848a..a1f2ae204 100644 --- a/crates/sparrow-runtime/src/execute/operation/scan.rs +++ b/crates/sparrow-runtime/src/execute/operation/scan.rs @@ -361,6 +361,7 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; use arrow::record_batch::RecordBatch; use futures::StreamExt; + use hashbrown::HashMap; use itertools::Itertools; use sparrow_api::kaskada::v1alpha::compute_table::FileSet; use sparrow_api::kaskada::v1alpha::operation_input_ref::{self, Column}; @@ -519,6 +520,7 @@ mod tests { max_event_tx, &Default::default(), None, + &HashMap::new(), ) .await .unwrap() diff --git a/crates/sparrow-runtime/src/execute/operation/testing.rs b/crates/sparrow-runtime/src/execute/operation/testing.rs index be209ecd5..0a89feb7d 100644 --- a/crates/sparrow-runtime/src/execute/operation/testing.rs +++ b/crates/sparrow-runtime/src/execute/operation/testing.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::Context; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use arrow::record_batch::RecordBatch; +use hashbrown::HashMap; use itertools::Itertools; use sparrow_api::kaskada::v1alpha::{ComputePlan, OperationPlan}; use sparrow_compiler::DataContext; @@ -198,6 +199,7 @@ pub(super) async fn run_operation( max_event_tx, &Default::default(), None, + &HashMap::new(), ) .await .unwrap() @@ -255,6 +257,7 @@ pub(super) async fn run_operation_json( max_event_tx, &Default::default(), None, + &HashMap::new(), ) .await .unwrap() diff --git a/python/pysrc/kaskada/_ffi.pyi b/python/pysrc/kaskada/_ffi.pyi index 07791114b..e05f87856 100644 --- a/python/pysrc/kaskada/_ffi.pyi +++ b/python/pysrc/kaskada/_ffi.pyi @@ -6,7 +6,6 @@ from typing import Sequence import pyarrow as pa from ._execution import ExecutionOptions -from .udf import Udf class Session: def __init__(self) -> None: ... 
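
Note on the Python-side conversion that the final patch below cleans up:
`_converted_func` is the bridge between the engine's Arrow arrays and the
user's Pandas function. PyArrow inputs are converted to Pandas, the wrapped
function runs, and the resulting Series is converted back to a PyArrow array
of the declared result type. The following is a minimal standalone sketch of
that round trip, not the shipped code: `converted` stands in for
`_converted_func`, `add` is the same toy function used in `udf_test.py`, and
only `pandas` and `pyarrow` are assumed.

    import pandas as pd
    import pyarrow as pa


    def add(x: pd.Series, y: pd.Series) -> pd.Series:
        """Use Pandas to add two numbers."""
        return x + y


    def converted(result_type: pa.DataType, *args: pa.Array) -> pa.Array:
        # PyArrow -> Pandas: int64 arrays with nulls arrive as float64 with NaN.
        pd_args = [arg.to_pandas() for arg in args]
        pd_result = add(*pd_args)
        if isinstance(pd_result, pd.Series):
            # Pandas -> PyArrow: NaN converts back to null in the declared type.
            return pa.Array.from_pandas(pd_result, type=result_type)
        else:
            raise TypeError(f"Unsupported result type: {type(pd_result)}")


    m = pa.array([5, 24, 17, None], type=pa.int64())
    n = pa.array([10, 3, 6, 9], type=pa.int64())
    print(converted(pa.int64(), m, n))  # values: 15, 27, 23, null

Because `pa.Array.from_pandas` treats NaN as null, missing inputs propagate to
missing outputs, which matches the rows in the golden files above where `m` or
`n` is null.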
From 5ebc7e7a28cfecc46eee6f32c9d465ba14c243dd Mon Sep 17 00:00:00 2001 From: Jordan Frazier Date: Mon, 28 Aug 2023 11:04:04 -0700 Subject: [PATCH 18/18] Move udfs into operation context --- crates/sparrow-runtime/src/execute.rs | 8 ++++---- .../src/execute/compute_executor.rs | 5 ----- crates/sparrow-runtime/src/execute/operation.rs | 6 +++--- .../src/execute/operation/scan.rs | 2 +- .../src/execute/operation/testing.rs | 4 ++-- crates/sparrow-session/src/session.rs | 2 +- python/.vscode/settings.json | 3 +++ python/pysrc/kaskada/__init__.py | 2 -- python/pysrc/kaskada/udf.py | 17 ++++++++++------- python/pytests/udf_test.py | 12 ++++++------ 10 files changed, 30 insertions(+), 31 deletions(-) create mode 100644 python/.vscode/settings.json diff --git a/crates/sparrow-runtime/src/execute.rs b/crates/sparrow-runtime/src/execute.rs index 6521447e3..37b8f33ef 100644 --- a/crates/sparrow-runtime/src/execute.rs +++ b/crates/sparrow-runtime/src/execute.rs @@ -75,7 +75,7 @@ pub async fn execute( data_context, options, None, - &HashMap::new(), + HashMap::new(), ) .await } @@ -234,7 +234,7 @@ pub async fn execute_new( mut data_context: DataContext, options: ExecutionOptions, key_hash_inverse: Option>, - udfs: &HashMap>, + udfs: HashMap>, ) -> error_stack::Result>, Error> { let object_stores = Arc::new(ObjectStoreRegistry::default()); @@ -280,6 +280,7 @@ pub async fn execute_new( output_at_time, bounded_lateness_ns: options.bounded_lateness_ns, materialize: options.materialize, + udfs, }; // Start executing the query. We pass the response channel to the @@ -301,7 +302,6 @@ pub async fn execute_new( progress_updates_rx, destination, options.stop_signal_rx, - udfs, ) .await .change_context(Error::internal_msg("spawn compute executor"))?; @@ -346,5 +346,5 @@ pub async fn materialize( // TODO: Unimplemented feature - UDFs let udfs = HashMap::new(); - execute_new(plan, destination, data_context, options, None, &udfs).await + execute_new(plan, destination, data_context, options, None, udfs).await } diff --git a/crates/sparrow-runtime/src/execute/compute_executor.rs b/crates/sparrow-runtime/src/execute/compute_executor.rs index 7faf111a9..155869f5b 100644 --- a/crates/sparrow-runtime/src/execute/compute_executor.rs +++ b/crates/sparrow-runtime/src/execute/compute_executor.rs @@ -5,19 +5,16 @@ use enum_map::EnumMap; use error_stack::{IntoReport, IntoReportCompat, ResultExt}; use futures::stream::{FuturesUnordered, PollNext}; use futures::{FutureExt, Stream, TryFutureExt}; -use hashbrown::HashMap; use prost_wkt_types::Timestamp; use sparrow_api::kaskada::v1alpha::ComputeSnapshot; use sparrow_api::kaskada::v1alpha::{ExecuteResponse, LateBoundValue, PlanHash}; use sparrow_arrow::scalar_value::ScalarValue; -use sparrow_instructions::Udf; use sparrow_qfr::io::writer::FlightRecordWriter; use sparrow_qfr::kaskada::sparrow::v1alpha::FlightRecordHeader; use sparrow_qfr::FlightRecorderFactory; use tokio_stream::wrappers::UnboundedReceiverStream; use tokio_stream::StreamExt; use tracing::{error, info, info_span, Instrument}; -use uuid::Uuid; use crate::execute::compute_store_guard::ComputeStoreGuard; use crate::execute::operation::{OperationContext, OperationExecutor}; @@ -60,7 +57,6 @@ impl ComputeExecutor { progress_updates_rx: tokio::sync::mpsc::Receiver, destination: Destination, stop_signal_rx: Option>, - udfs: &HashMap>, ) -> error_stack::Result { let mut spawner = ComputeTaskSpawner::new(); @@ -152,7 +148,6 @@ impl ComputeExecutor { max_event_time_tx.clone(), late_bindings, stop_signal_rx.clone(), - udfs, ) 
.await?, ); diff --git a/crates/sparrow-runtime/src/execute/operation.rs b/crates/sparrow-runtime/src/execute/operation.rs index b642d0714..a2a558fa1 100644 --- a/crates/sparrow-runtime/src/execute/operation.rs +++ b/crates/sparrow-runtime/src/execute/operation.rs @@ -105,6 +105,8 @@ pub(crate) struct OperationContext { /// /// Derived from the ExecutionOptions, pub materialize: bool, + /// Mapping of uuid to user-defined functions. + pub udfs: HashMap>, } impl OperationContext { @@ -169,7 +171,6 @@ impl OperationExecutor { /// This function will not return until the operation is complete -- either /// because all inputs have been processed or because it has been /// interrupted due to an error or reaching a preview-rows limit, etc. - #[allow(clippy::too_many_arguments)] pub async fn execute( self, operation_index: usize, @@ -178,7 +179,6 @@ impl OperationExecutor { max_event_time_tx: tokio::sync::mpsc::UnboundedSender, late_bindings: &EnumMap>, stop_signal_rx: Option>, - udfs: &HashMap>, ) -> Result>, Error> { let Self { operation, @@ -195,7 +195,7 @@ impl OperationExecutor { operation_label, operation.expressions, late_bindings, - udfs, + &context.udfs, ) .into_report() .change_context(Error::internal_msg("unable to create executor"))?; diff --git a/crates/sparrow-runtime/src/execute/operation/scan.rs b/crates/sparrow-runtime/src/execute/operation/scan.rs index a1f2ae204..1bb1c3eb4 100644 --- a/crates/sparrow-runtime/src/execute/operation/scan.rs +++ b/crates/sparrow-runtime/src/execute/operation/scan.rs @@ -510,6 +510,7 @@ mod tests { output_at_time: None, bounded_lateness_ns: None, materialize: false, + udfs: HashMap::new(), }; executor @@ -520,7 +521,6 @@ mod tests { max_event_tx, &Default::default(), None, - &HashMap::new(), ) .await .unwrap() diff --git a/crates/sparrow-runtime/src/execute/operation/testing.rs b/crates/sparrow-runtime/src/execute/operation/testing.rs index 0a89feb7d..1ef6b5bb1 100644 --- a/crates/sparrow-runtime/src/execute/operation/testing.rs +++ b/crates/sparrow-runtime/src/execute/operation/testing.rs @@ -190,6 +190,7 @@ pub(super) async fn run_operation( output_at_time: None, bounded_lateness_ns: None, materialize: false, + udfs: HashMap::new(), }; executor .execute( @@ -199,7 +200,6 @@ pub(super) async fn run_operation( max_event_tx, &Default::default(), None, - &HashMap::new(), ) .await .unwrap() @@ -248,6 +248,7 @@ pub(super) async fn run_operation_json( output_at_time: None, bounded_lateness_ns: None, materialize: false, + udfs: HashMap::new(), }; executor .execute( @@ -257,7 +258,6 @@ pub(super) async fn run_operation_json( max_event_tx, &Default::default(), None, - &HashMap::new(), ) .await .unwrap() diff --git a/crates/sparrow-session/src/session.rs b/crates/sparrow-session/src/session.rs index e1f1cb908..d7edbd738 100644 --- a/crates/sparrow-session/src/session.rs +++ b/crates/sparrow-session/src/session.rs @@ -423,7 +423,7 @@ impl Session { data_context, options, Some(key_hash_inverse), - &self.udfs, + self.udfs.clone(), )) .change_context(Error::Execute)? 
.map_err(|e| e.change_context(Error::Execute)) diff --git a/python/.vscode/settings.json b/python/.vscode/settings.json new file mode 100644 index 000000000..23fd35f0e --- /dev/null +++ b/python/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "editor.formatOnSave": true +} \ No newline at end of file diff --git a/python/pysrc/kaskada/__init__.py b/python/pysrc/kaskada/__init__.py index b4d5db186..bc5121cc2 100644 --- a/python/pysrc/kaskada/__init__.py +++ b/python/pysrc/kaskada/__init__.py @@ -10,7 +10,6 @@ from ._timestream import Literal from ._timestream import Timestream from ._timestream import record -from .udf import Udf from .udf import udf @@ -23,7 +22,6 @@ "Result", "sources", "Timestream", - "Udf", "udf", "windows", ] diff --git a/python/pysrc/kaskada/udf.py b/python/pysrc/kaskada/udf.py index 8d7058257..15f06d8f0 100644 --- a/python/pysrc/kaskada/udf.py +++ b/python/pysrc/kaskada/udf.py @@ -21,7 +21,6 @@ class Udf: def __init__(self, signature: str, func: Callable[..., pa.Array]) -> None: """Create a UDF with the given signature.""" - functools.update_wrapper(self, func) self._ffi = _ffi.Udf(signature, func) def __call__(self, *args: Timestream | Literal) -> Timestream: @@ -32,31 +31,35 @@ def __call__(self, *args: Timestream | Literal) -> Timestream: def udf(signature: str): """Decorate a function for use as a Kaskada UDF.""" - def decorator(func_type: FuncType): + def decorator(user_func: FuncType): # 1. Convert the `FuncType` to the type expected by Udf. # This needs to take the PyArrow result type and PyArrow inputs, # convert them to Pandas, call the function, and convert the result # back to PyArrow. def func(result_type, *args): - return _converted_func(func_type, result_type, *args) + return _converted_func(user_func, result_type, *args) # 2. Create the UDF object. - return Udf(signature, func) + udf = Udf(signature, func) + + # 3. Update the udf with the user function's documentation + functools.update_wrapper(udf, user_func) + + return udf return decorator def _converted_func( - func_type: FuncType, result_type: pa.DataType, *args: pa.Array + user_func: FuncType, result_type: pa.DataType, *args: pa.Array ) -> pa.Array: """Run the function producing the given result type.""" # TODO: I believe this will return a series for simple arrays, and a # dataframe for struct arrays. We should explore how this handles # different types. pd_args = [arg.to_pandas() for arg in args] - pd_result = func_type(*pd_args) + pd_result = user_func(*pd_args) - print("Result type: ", result_type) if isinstance(pd_result, pd.Series): return pa.Array.from_pandas(pd_result, type=result_type) else: diff --git a/python/pytests/udf_test.py b/python/pytests/udf_test.py index f05b4e4b1..e9bbd788b 100644 --- a/python/pytests/udf_test.py +++ b/python/pytests/udf_test.py @@ -2,6 +2,7 @@ import kaskada._ffi as _ffi import pandas as pd import pytest +from kaskada.udf import Udf @kd.udf("add(x: N, y: N) -> N") @@ -16,16 +17,15 @@ def add_x2(x: pd.Series, y: pd.Series) -> pd.Series: return (x + y) * 2 -@pytest.mark.skip(reason="currently not working") -def test_docstring() -> None: - assert add.__doc__ == "Use Pandas to add two numbers." - - def test_udf_instance() -> None: - assert isinstance(add, kd.Udf) + assert isinstance(add, Udf) assert isinstance(add._ffi, _ffi.Udf) +def test_docstring() -> None: + assert add.__doc__ == "Use Pandas to add two numbers." + + @pytest.fixture def source_int64() -> kd.sources.CsvString: content = "\n".join(