From 9ce0d00fb737bfc0a0ca540e567305e2cca0c0ee Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 9 Dec 2021 01:52:53 +0000 Subject: [PATCH 1/7] improve to native types --- slide/utils.py | 311 ++++++++++++++++++++++++++++++++++---------- slide_test/suite.py | 282 ++++++++++++++++++++++++--------------- 2 files changed, 418 insertions(+), 175 deletions(-) diff --git a/slide/utils.py b/slide/utils.py index 5c602be..2f3f312 100644 --- a/slide/utils.py +++ b/slide/utils.py @@ -16,9 +16,9 @@ from triad.utils.assertion import assert_or_throw from triad.utils.pyarrow import ( TRIAD_DEFAULT_TIMESTAMP, - apply_schema, to_pa_datatype, to_single_pandas_dtype, + _TypeConverter, ) from slide._string_utils import LikeExpr, LikeExprShortcut @@ -262,27 +262,49 @@ def cast( # noqa: C901 nulls or strings, where the pandas dtype may not provide the accurate type information. """ - to_type = self.to_safe_pa_type(type_obj) - t_type = to_single_pandas_dtype(to_type, use_extension_types=True) try: if self.is_series(col): - try: - inf_type = self.get_col_pa_type(col) - has_hint = input_type is not None - from_type = ( - inf_type if not has_hint else self.to_safe_pa_type(input_type) + to_type = self.to_safe_pa_type(type_obj) + input_pa_type = ( + None if input_type is None else self.to_safe_pa_type(input_type) + ) + if ( # nested/binary as input/output + pa.types.is_nested(to_type) + or pa.types.is_binary(to_type) + or ( + input_pa_type is not None + and ( + pa.types.is_nested(input_pa_type) + or pa.types.is_binary(input_pa_type) + ) + ) + ): + assert_or_throw( + pd.api.types.is_object_dtype(col.dtype), + SlideCastError(f"unexpected column type {col.dtype}"), + ) + assert_or_throw( + input_type is None + or self.to_safe_pa_type(input_type) == to_type, + SlideCastError(f"unexpected column type hint {input_type}"), ) - if pa.types.is_string(to_type): - if ( - has_hint - and pa.types.is_string(from_type) - and pa.types.is_string(inf_type) - ): - return col - elif from_type == inf_type == 
to_type: + return col + + t_type = to_single_pandas_dtype(to_type, use_extension_types=True) + inf_type = self.get_col_pa_type(col) + has_hint = input_type is not None + from_type = input_pa_type or inf_type + + if pa.types.is_string(to_type): + if ( + has_hint + and pa.types.is_string(from_type) + and pa.types.is_string(inf_type) + ): return col - except Exception: # pragma: no cover - return col.astype(t_type) + elif from_type == inf_type == to_type: + return col + if pa.types.is_boolean(to_type): return self._cast_to_bool(col, from_type, inf_type, t_type) if pa.types.is_integer(to_type): @@ -295,7 +317,7 @@ def cast( # noqa: C901 return self._cast_to_date(col, from_type, inf_type, t_type) elif pa.types.is_string(to_type): return self._cast_to_str(col, from_type, inf_type, t_type) - return col.astype(t_type) + return col.astype(t_type) # pragma: no cover else: if col is None: return None @@ -313,7 +335,7 @@ def cast( # noqa: C901 return None return res except (TypeError, ValueError) as te: - raise SlideCastError(f"unable to cast from {from_type} to {t_type}") from te + raise SlideCastError(str(te)) from te def filter_df(self, df: TDf, cond: Any) -> TDf: """Filter dataframe by a boolean series or a constant @@ -645,63 +667,23 @@ def as_pandas(self, df: TDf) -> pd.DataFrame: """ raise NotImplementedError # pragma: no cover - def as_array_iterable( + def create_native_converter( self, - df: TDf, - schema: pa.Schema, - columns: Optional[List[str]] = None, - type_safe: bool = False, - ) -> Iterable[List[Any]]: - """Convert pandas like dataframe to iterable of rows in the format of list. 
+ input_schema: pa.Schema, + type_safe: bool, + ) -> "SlideDataFrameNativeConverter": + """Create a converter that convert the dataframes into python native iterables - :param df: pandas like dataframe - :param schema: schema of the input - :param columns: columns to output, None for all columns + :param input_schema: schema of the input dataframe :param type_safe: whether to enforce the types in schema, if False, it will - return the original values from the dataframe - :return: iterable of rows, each row is a list + return the original values from the dataframes + :return: the converter - .. note:: + .. tip:: - If there are nested types in schema, the conversion can be slower + This converter can be reused on multiple dataframes with the same structure """ - if self.empty(df): - return - if columns is not None: - df = df[columns] - schema = pa.schema([schema.field(n) for n in columns]) - if not type_safe: - for arr in df.astype(object).itertuples(index=False, name=None): - yield list(arr) - elif all(not pa.types.is_nested(x) for x in schema.types): - p = self.as_arrow(df, schema, type_safe) - d = p.to_pydict() - cols = [d[n] for n in schema.names] - for arr in zip(*cols): - yield list(arr) - else: - # If schema has nested types, the conversion will be much slower - for arr in apply_schema( - schema, - df.itertuples(index=False, name=None), - copy=True, - deep=True, - str_as_json=True, - ): - yield arr - - def as_array( - self, - df: TDf, - schema, - columns: Optional[List[str]] = None, - type_safe: bool = False, - ) -> List[List[Any]]: - return list( - self.as_array_iterable( - df, schema=schema, columns=columns, type_safe=type_safe - ) - ) + return SlideDataFrameNativeConverter(self, input_schema, type_safe) def to_schema(self, df: TDf) -> pa.Schema: """Extract pandas dataframe schema as pyarrow schema. 
This is a replacement @@ -1120,3 +1102,190 @@ def _cast_to_date( if pd.__version__ < "1.2": # pragma: no cover return col.astype(safe_dtype).dt.floor("D") return col.astype(safe_dtype).dt.date + + +class SlideDataFrameNativeConverter: + def __init__( + self, + utils: SlideUtils, + schema: pa.Schema, + type_safe: bool, + ): + """Convert pandas like dataframe to iterable of rows in the format of list. + + :param utils: the associated SlideUtils + :param schema: schema of the input dataframe + :param type_safe: whether to enforce the types in schema, if False, it will + return the original values from the dataframes + + .. note:: + + If there are nested types in schema, the conversion can be slower + """ + self._utils = utils + self._schema = schema + self._has_time = any( + pa.types.is_timestamp(x) or pa.types.is_date(x) for x in schema.types + ) + if not type_safe: + self._as_array_iterable = self._as_array_iterable_not_type_safe + self._as_arrays = self._as_arrays_not_type_safe + self._as_dict_iterable = self._as_dict_iterable_not_type_safe + self._as_dicts = self._as_dicts_not_type_safe + else: + self._split_nested(self._schema) + if self._converter is None: + self._as_array_iterable = self._as_array_iterable_simple + self._as_arrays = self._as_arrays_simple + self._as_dict_iterable = self._as_dict_iterable_simple + self._as_dicts = self._as_dicts_simple + elif self._simple_part is None: + self._as_array_iterable = self._as_array_iterable_nested + self._as_arrays = self._as_arrays_nested + self._as_dict_iterable = self._as_dict_iterable_nested + self._as_dicts = self._as_dicts_nested + else: + self._as_array_iterable = self._as_array_iterable_hybrid + self._as_arrays = self._as_arrays_hybrid + self._as_dict_iterable = self._as_dict_iterable_hybrid + self._as_dicts = self._as_dicts_hybrid + pass + + def as_array_iterable(self, df: Any) -> Iterable[List[Any]]: + """Convert the dataframe to an iterable of rows in the format of list. 
+ + :param df: the dataframe + :return: an iterable of rows, each row is a list + """ + return self._as_array_iterable(df) + + def as_arrays(self, df: Any) -> List[List[Any]]: + """Convert the dataframe to a list of rows in the format of list. + + :param df: the dataframe + :return: a list of rows, each row is a list + """ + return self._as_arrays(df) + + def as_dict_iterable(self, df: Any) -> Iterable[Dict[str, Any]]: + """Convert the dataframe to an iterable of rows in the format of dict. + + :param df: the dataframe + :return: an iterable of rows, each row is a dict + """ + return self._as_dict_iterable(df) + + def as_dicts(self, df: Any) -> List[Dict[str, Any]]: + """Convert the dataframe to a list of rows in the format of dict. + + :param df: the dataframe + :return: a list of rows, each row is a dict + """ + return self._as_dicts(df) + + def _time_safe(self, df: Any) -> Any: + return df.astype(object) if self._has_time else df + + def _as_array_iterable_not_type_safe(self, df: Any) -> Iterable[List[Any]]: + for arr in self._time_safe(df).itertuples(index=False, name=None): + yield list(arr) + + def _as_arrays_not_type_safe(self, df: Any) -> List[List[Any]]: + return self._time_safe(self._utils.as_pandas(df)).values.tolist() + + def _as_dict_iterable_not_type_safe(self, df: Any) -> Iterable[Dict[str, Any]]: + names = list(self._schema.names) + for arr in self._time_safe(df).itertuples(index=False, name=None): + yield dict(zip(names, arr)) + + def _as_dicts_not_type_safe(self, df: Any) -> List[Dict[str, Any]]: + return self._time_safe(self._utils.as_pandas(df)).to_dict("records") + + def _as_array_iterable_simple(self, df: Any) -> Iterable[List[Any]]: + return self._get_arrow_arrays_simple(df, self._schema) + + def _as_arrays_simple(self, df: Any) -> List[List[Any]]: + return list(self._get_arrow_arrays_simple(df, self._schema)) + + def _as_dict_iterable_simple(self, df: Any) -> Iterable[Dict[str, Any]]: + for arr in self._get_arrow_arrays_simple(df, 
self._schema): + yield dict(zip(self._schema.names, arr)) + + def _as_dicts_simple(self, df: Any) -> List[Dict[str, Any]]: + return list(self._as_dict_iterable_simple(df)) + + def _as_array_iterable_hybrid(self, df: Any) -> Iterable[List[Any]]: + for arr1, arr2 in zip(self._simple_part(df), self._nested_part(df)): + yield self._remap_arrs(arr1, arr2) + + def _as_arrays_hybrid(self, df: Any) -> List[List[Any]]: + return list(self._as_array_iterable_hybrid(df)) + + def _as_dict_iterable_hybrid(self, df: Any) -> Iterable[Dict[str, Any]]: + names = list(self._schema.names) + for arr in self._as_array_iterable_hybrid(df): + yield dict(zip(names, arr)) + + def _as_dicts_hybrid(self, df: Any) -> List[Dict[str, Any]]: + return list(self._as_dict_iterable_hybrid(df)) + + def _as_array_iterable_nested(self, df: Any) -> Iterable[List[Any]]: + return self._nested_part(df) + + def _as_arrays_nested(self, df: Any) -> List[List[Any]]: + return list(self._nested_part(df)) + + def _as_dict_iterable_nested(self, df: Any) -> Iterable[Dict[str, Any]]: + names = list(self._schema.names) + for arr in self._nested_part(df): + yield dict(zip(names, arr)) + + def _as_dicts_nested(self, df: Any) -> List[Dict[str, Any]]: + return list(self._as_dict_iterable_nested(df)) + + def _split_nested(self, schema: pa.Schema) -> None: + cols1: List[pa.Field] = [] + cols2: List[pa.Field] = [] + self._remap: List[Tuple[int, int]] = [] + for field in schema: + if pa.types.is_nested(field.type): + self._remap.append((1, len(cols2))) + cols2.append(field) + else: + self._remap.append((0, len(cols1))) + cols1.append(field) + self._simple_schema = pa.schema(cols1) + self._simple_part: Any = ( + None + if len(cols1) == 0 + else lambda df: self._get_arrow_arrays_simple( + df[self._simple_schema.names], self._simple_schema + ) + ) + self._nested_schema = pa.schema(cols2) + self._converter: Any = ( + None + if len(cols2) == 0 + else _TypeConverter( + pa.schema(cols2), copy=True, deep=True, str_as_json=True + ) + 
) + self._nested_part = lambda df: self._get_arrays_nested( + df[self._nested_schema.names], self._nested_schema + ) + + def _remap_arrs(self, *arrs: List[List[Any]]) -> List[Any]: + return [arrs[x[0]][x[1]] for x in self._remap] + + def _get_arrow_arrays_simple( + self, df: Any, schema: pa.Schema + ) -> Iterable[List[Any]]: + p = self._utils.as_arrow(df, schema, True) + d = p.to_pydict() + cols = [d[n] for n in schema.names] + for arr in zip(*cols): + yield list(arr) + + def _get_arrays_nested(self, df: Any, schema: pa.Schema) -> Iterable[List[Any]]: + for item in df.itertuples(index=False, name=None): + yield self._converter.row_to_py(item) diff --git a/slide_test/suite.py b/slide_test/suite.py index 954412b..835f246 100644 --- a/slide_test/suite.py +++ b/slide_test/suite.py @@ -1,5 +1,4 @@ import json -import pickle from datetime import date, datetime from typing import Any from unittest import TestCase @@ -1793,6 +1792,41 @@ def test_cast_time(self): check_order=False, ) + def test_cast_nested(self): + # happy path + pdf = pd.DataFrame(dict(a=[None, [1, 2]], b=[{"d": "x"}, None])) + + schema = Schema("a:[int],b:{d:str}").pa_schema + df = self.to_df(pdf, "a:[int],b:{d:str}") + df["h"] = self.utils.cast(df.a, schema[0].type) + df["i"] = self.utils.cast(df.b, schema[1].type, schema[1].type) + + assert [[None, {"d": "x"}], [[1, 2], None]] == self.to_pd( + df[list("hi")] + ).values.tolist() + + with raises(SlideCastError): + df["j"] = self.utils.cast(df.a, schema[0].type, str) + + with raises(SlideCastError): + df["j"] = self.utils.cast(df.a, str, schema[0].type) + + def test_cast_binary(self): + # happy path + pdf = pd.DataFrame(dict(a=[None, b"\0abc"])) + + schema = Schema("a:binary").pa_schema + df = self.to_df(pdf, "a:binary") + df["h"] = self.utils.cast(df.a, schema[0].type) + + assert [[None], [b"\0abc"]] == self.to_pd(df[["h"]]).values.tolist() + + with raises(SlideCastError): + df["j"] = self.utils.cast(df.a, schema[0].type, str) + + with 
raises(SlideCastError): + df["j"] = self.utils.cast(df.a, str, schema[0].type) + def test_cast_df(self): a = pd.DataFrame(dict(a=[1, 2, None], b=[True, None, False])) df = self.utils.cast_df( @@ -1892,130 +1926,170 @@ def test_index_compatible(self): df = df.reset_index(drop=True) self.utils.ensure_compatible(df) - def test_as_array_iterable(self): + def test_converter_not_safe(self): schema = Schema("a:str,b:int").pa_schema + c = self.utils.create_native_converter(schema, type_safe=False) df = self.to_df([], "a:str,b:int") - assert [] == self.utils.as_array(df, schema) - assert [] == self.utils.as_array(df, schema, type_safe=True) + assert [] == c.as_arrays(df) + assert [] == list(c.as_array_iterable(df)) + assert [] == c.as_dicts(df) + assert [] == list(c.as_dict_iterable(df)) - df = self.to_df([["a", 1]], "a:str,b:int") - assert [["a", 1]] == self.utils.as_array(df, schema) - assert [["a", 1]] == self.utils.as_array(df, schema, columns=["a", "b"]) - assert [[1, "a"]] == self.utils.as_array(df, schema, columns=["b", "a"]) + df = self.to_df([["xx", 123]], "a:str,b:int") + assert [["xx", 123]] == c.as_arrays(df) + assert [["xx", 123]] == list(c.as_array_iterable(df)) + assert [{"a": "xx", "b": 123}] == c.as_dicts(df) + assert [{"a": "xx", "b": 123}] == list(c.as_dict_iterable(df)) - # prevent pandas auto type casting schema = Schema("a:double,b:int").pa_schema - df = self.to_df([[1.0, 1.0]], "a:double,b:int") - data = self.utils.as_array(df, schema) - assert [[1.0, 1]] == data - assert isinstance(data[0][0], float) - assert isinstance(data[0][1], int) - assert [[1.0, 1]] == self.utils.as_array(df, schema, columns=["a", "b"]) - assert [[1, 1.0]] == self.utils.as_array(df, schema, columns=["b", "a"]) - - df = self.to_df([[np.float64(1.0), 1.0]], "a:double,b:int") - assert [[1.0, 1]] == self.utils.as_array(df, schema) - assert isinstance(self.utils.as_array(df, schema)[0][0], float) - assert isinstance(self.utils.as_array(df, schema)[0][1], int) - - schema = 
Schema("a:datetime,b:int").pa_schema + c = self.utils.create_native_converter(schema, type_safe=False) + df = self.to_df([[1.1, 1.1]], "a:double,b:double") + res = c.as_arrays(df) + assert [[1.1, 1.1]] == res + assert isinstance(res[0][1], float) + + schema = Schema("a:datetime,b:date").pa_schema + c = self.utils.create_native_converter(schema, type_safe=False) df = self.to_df( - [[pd.Timestamp("2020-01-01"), 1.0]], - "a:datetime,b:int", - ) - assert [[datetime(2020, 1, 1), 1]] == self.utils.as_array(df, schema) - assert isinstance( - self.utils.as_array(df, schema, type_safe=True)[0][0], datetime - ) - assert isinstance( - self.utils.as_array(df, schema, type_safe=True)[0][1], int - ) - - df = self.to_df([[pd.NaT, 1.0]], "a:datetime,b:int") - assert self.utils.as_array(df, schema, type_safe=True)[0][0] is None - assert isinstance( - self.utils.as_array(df, schema, type_safe=True)[0][1], int - ) - - schema = Schema("a:double,b:int").pa_schema - df = self.to_df([[1.0, 1.0]], "a:double,b:int") - assert [[1.0, 1]] == self.utils.as_array(df, schema, type_safe=True) - assert isinstance(self.utils.as_array(df, schema)[0][0], float) - assert isinstance(self.utils.as_array(df, schema)[0][1], int) - - def test_as_array_iterable_datetime(self): + [[datetime(2020, 1, 1), date(2020, 1, 2)]], "a:datetime,b:date" + ) + res = c.as_arrays(df) + assert [[datetime(2020, 1, 1), date(2020, 1, 2)]] == res + + def test_converter_simple(self): + schema = Schema("a:bool,b:int").pa_schema + c = self.utils.create_native_converter(schema, type_safe=True) + df = self.to_df([[True, None], [None, 1], [None, None]], "a:bool,b:int") + res = c.as_arrays(df) + expected = [[True, None], [None, 1], [None, None]] + assert expected == res + res = list(c.as_array_iterable(df)) + assert expected == res + res = c.as_dicts(df) + assert [dict(zip(["a", "b"], x)) for x in expected] == res + res = list(c.as_dict_iterable(df)) + assert [dict(zip(["a", "b"], x)) for x in expected] == res + + schema = 
Schema("a:str,b:double").pa_schema + c = self.utils.create_native_converter(schema, type_safe=True) + df = self.to_df([["ab", None], [None, 1.1], [None, None]], "a:str,b:double") + res = list(c.as_array_iterable(df)) + expected = [["ab", None], [None, 1.1], [None, None]] + assert expected == res + res = list(c.as_array_iterable(df)) + assert expected == res + res = c.as_dicts(df) + assert [dict(zip(["a", "b"], x)) for x in expected] == res + res = list(c.as_dict_iterable(df)) + assert [dict(zip(["a", "b"], x)) for x in expected] == res + + schema = Schema("a:datetime,b:date").pa_schema + c = self.utils.create_native_converter(schema, type_safe=True) df = self.to_df( - [[datetime(2020, 1, 1, 2, 3, 4, 5), date(2020, 2, 2)]], - columns="a:datetime,b:date", - ) - v1 = list( - self.utils.as_array_iterable( - df, schema=expression_to_schema("a:datetime,b:date"), type_safe=True - ) - )[0] - assert not isinstance(v1[0], pd.Timestamp) - assert isinstance(v1[0], datetime) - assert isinstance(v1[1], date) - - def test_nested(self): - # data = [[dict(b=[30, "40"])]] - # s = expression_to_schema("a:{a:str,b:[int]}") - # df = self.to_df(data, "a:{a:str,b:[int]}") - # a = df.as_array(type_safe=True) - # assert [[dict(a=None, b=[30, 40])]] == a - - data = [[[json.dumps(dict(b=[30, "40"]))]]] - df = self.to_df(data, "a:[{a:str,b:[int]}]", coerce=False) - a = self.utils.as_array( - df, schema=Schema("a:[{a:str,b:[int]}]").pa_schema, type_safe=True - ) - assert [[[dict(a=None, b=[30, 40])]]] == a + [[datetime(2020, 1, 1), datetime(2020, 1, 2)]], "a:datetime,b:date" + ) + res = c.as_arrays(df) + expected = [[datetime(2020, 1, 1), date(2020, 1, 2)]] + assert expected == res + assert isinstance(res[0][0], datetime) + assert isinstance(res[0][1], date) + res = list(c.as_array_iterable(df)) + assert expected == res + res = c.as_dicts(df) + assert [dict(zip(["a", "b"], x)) for x in expected] == res + res = list(c.as_dict_iterable(df)) + assert [dict(zip(["a", "b"], x)) for x in expected] 
== res + + def test_converter_nested(self): + data = [[dict(b=[30, "40"])]] + schema = expression_to_schema("a:{a:str,b:[int]}") + c = self.utils.create_native_converter(schema, type_safe=True) + df = self.to_df(data, "a:{a:str,b:[int]}") + a = c.as_arrays(df) + assert [[dict(a=None, b=[30, 40])]] == a + a = list(c.as_array_iterable(df)) + assert [[dict(a=None, b=[30, 40])]] == a + a = c.as_dicts(df) + assert [{"a": dict(a=None, b=[30, 40])}] == a + a = list(c.as_dict_iterable(df)) + assert [{"a": dict(a=None, b=[30, 40])}] == a + + data = [[json.dumps(dict(b=[30, "40"]))]] + df = self.to_df(data, "a:str") + a = c.as_arrays(df) + assert [[dict(a=None, b=[30, 40])]] == a + + data = [[["1", 2]]] + schema = expression_to_schema("a:[int]") + c = self.utils.create_native_converter(schema, type_safe=True) + df = self.to_df(data, "a:[int]") + a = c.as_arrays(df) + assert [[[1, 2]]] == a data = [[json.dumps(["1", 2])]] - df = self.to_df(data, "a:[int]", coerce=False) - a = self.utils.as_array( - df, schema=Schema("a:[int]").pa_schema, type_safe=True - ) + schema = expression_to_schema("a:[int]") + c = self.utils.create_native_converter(schema, type_safe=True) + df = self.to_df(data, "a:str") + a = c.as_arrays(df) assert [[[1, 2]]] == a - def test_binary(self): - b = pickle.dumps("xyz") - data = [[b, b"xy"]] - df = self.to_df(data, "a:bytes,b:bytes") - a = self.utils.as_array( - df, schema=Schema("a:bytes,b:bytes").pa_schema, type_safe=True - ) - assert [[b, b"xy"]] == a - - def test_nan_none(self): + def test_converter_binary(self): + data = [[b"\0abc"]] + schema = expression_to_schema("a:binary") + c = self.utils.create_native_converter(schema, type_safe=True) + df = self.to_df(data, "a:binary") + a = c.as_arrays(df) + assert [[b"\0abc"]] == a + a = list(c.as_array_iterable(df)) + assert [[b"\0abc"]] == a + a = c.as_dicts(df) + assert [{"a": b"\0abc"}] == a + a = list(c.as_dict_iterable(df)) + assert [{"a": b"\0abc"}] == a + + def test_converter_hybrid(self): + data = 
[[b"\0abc", 1, [1, 2], "ab"]] + schema = expression_to_schema("a:binary,b:long,c:[long],d:str") + c = self.utils.create_native_converter(schema, type_safe=True) + df = self.to_df(data, "a:binary,b:long,c:[long],d:str") + a = c.as_arrays(df) + assert data == a + a = list(c.as_array_iterable(df)) + assert data == a + a = c.as_dicts(df) + assert [{"a": b"\0abc", "b": 1, "c": [1, 2], "d": "ab"}] == a + a = list(c.as_dict_iterable(df)) + assert [{"a": b"\0abc", "b": 1, "c": [1, 2], "d": "ab"}] == a + + def test_converter_nan_none(self): schema = Schema("b:str,c:double").pa_schema + c = self.utils.create_native_converter(schema, type_safe=True) df = self.to_df([[None, None]], "b:str,c:double") - arr = self.utils.as_array(df, schema, type_safe=True)[0] - assert arr[0] is None - assert arr[1] is None - - df = self.to_df([], "b:str,c:double") - assert len(self.utils.as_array(df, schema)) == 0 + a = c.as_arrays(df) + assert a[0][0] is None + assert a[0][1] is None schema = Schema("b:int,c:bool").pa_schema + c = self.utils.create_native_converter(schema, type_safe=True) df = self.to_df([[None, None]], "b:int,c:bool") - arr = self.utils.as_array(df, schema, type_safe=True)[0] - assert arr[0] is None - assert arr[1] is None + a = c.as_arrays(df) + assert a[0][0] is None + assert a[0][1] is None - def test_boolean_enforce(self): + def test_converter_boolean(self): schema = Schema("b:int,c:bool").pa_schema + c = self.utils.create_native_converter(schema, type_safe=True) df = self.to_df([[1, True], [2, False], [3, None]], "b:int,c:bool") - arr = self.utils.as_array(df, schema, type_safe=True) - assert [[1, True], [2, False], [3, None]] == arr + a = c.as_arrays(df) + assert [[1, True], [2, False], [3, None]] == a - df = self.to_df([[1, 1], [2, 0]], "b:int,c:bool") - arr = self.utils.as_array(df, schema, type_safe=True) - assert [[1, True], [2, False]] == arr + df = self.to_df([[1, 1], [2, 0]], "b:int,c:int") + a = c.as_arrays(df) + assert [[1, True], [2, False]] == a - df = 
self.to_df([[1, 1.0], [2, 0.0]], "b:int,c:bool") - arr = self.utils.as_array(df, schema, type_safe=True) - assert [[1, True], [2, False]] == arr + df = self.to_df([[1, 1.0], [2, 0.0]], "b:int,c:double") + a = c.as_arrays(df) + assert [[1, True], [2, False]] == a def test_sql_group_by_apply(self): df = self.to_df([["a", 1], ["a", 2], [None, 3]], "b:str,c:long") From 8a0af0790585d3ef1d0c48fe3aca8f159298a903 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 11 Dec 2021 00:09:09 +0000 Subject: [PATCH 2/7] fix unittests --- slide_test/suite.py | 3 ++- tests/slide_dask/test_utils.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/slide_test/suite.py b/slide_test/suite.py index 835f246..9c1612c 100644 --- a/slide_test/suite.py +++ b/slide_test/suite.py @@ -1816,7 +1816,8 @@ def test_cast_binary(self): pdf = pd.DataFrame(dict(a=[None, b"\0abc"])) schema = Schema("a:binary").pa_schema - df = self.to_df(pdf, "a:binary") + coerce = pd.__version__ >= "1.2" + df = self.to_df(pdf, "a:binary", coerce=coerce) df["h"] = self.utils.cast(df.a, schema[0].type) assert [[None], [b"\0abc"]] == self.to_pd(df[["h"]]).values.tolist() diff --git a/tests/slide_dask/test_utils.py b/tests/slide_dask/test_utils.py index 4c43ea5..2d50741 100644 --- a/tests/slide_dask/test_utils.py +++ b/tests/slide_dask/test_utils.py @@ -22,20 +22,25 @@ def to_df( columns: Any = None, coerce: bool = True, ): + def _get_pdf(df: pd.DataFrame) -> pd.DataFrame: + if coerce: + return df.convert_dtypes() + return df + if isinstance(columns, str): s = expression_to_schema(columns) df = dd.from_pandas( - pd.DataFrame(data, columns=s.names).convert_dtypes(), npartitions=2 + _get_pdf(pd.DataFrame(data, columns=s.names)), npartitions=2 ) if coerce: df = self.utils.cast_df(df, s) return df elif isinstance(data, list): return dd.from_pandas( - pd.DataFrame(data, columns=columns).convert_dtypes(), npartitions=2 + _get_pdf(pd.DataFrame(data, columns=columns)), npartitions=2 ) elif 
isinstance(data, pd.DataFrame): - return dd.from_pandas(data.convert_dtypes(), npartitions=2) + return dd.from_pandas(_get_pdf(data), npartitions=2) elif isinstance(data, dd.DataFrame): return data raise NotImplementedError From 8457a87207b87ad4e739287ddf3ca830a91a8bc2 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 12 Dec 2021 08:40:28 +0000 Subject: [PATCH 3/7] update execution plan --- slide/operators/__init__.py | 0 slide/operators/select_operators.py | 264 ++++++++++++++++++ slide/utils.py | 110 ++++---- slide_dask/utils.py | 79 ++++-- slide_pandas/__init__.py | 2 + slide_pandas/utils.py | 58 +++- slide_test/suite.py | 56 +++- .../slide/operators/test_select_operators.py | 141 ++++++++++ 8 files changed, 623 insertions(+), 87 deletions(-) create mode 100644 slide/operators/__init__.py create mode 100644 slide/operators/select_operators.py create mode 100644 tests/slide/operators/test_select_operators.py diff --git a/slide/operators/__init__.py b/slide/operators/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/slide/operators/select_operators.py b/slide/operators/select_operators.py new file mode 100644 index 0000000..45e39b8 --- /dev/null +++ b/slide/operators/select_operators.py @@ -0,0 +1,264 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import pyarrow as pa +from slide.utils import SlideUtils +from triad import Schema, to_uuid + + +class SelectOperator: + def __init__(self, *args: Any, **kwargs: Any): + self._args = args + self._kwargs = kwargs + self._uuid = to_uuid(self.identifier, self._args, self._kwargs) + pass + + @property + def identifier(self) -> str: + return str(type(self)) + + @property + def key(self) -> str: + return "_" + to_uuid(self)[:8] + + def execute(self, context: "SelectExecutionContext") -> None: + raise NotImplementedError # pragma: no cover + + @property + def output_type(self) -> pa.DataType: + raise NotImplementedError # pragma: no cover + + @property + def output_name(self) -> 
Optional[str]: + raise NotImplementedError # pragma: no cover + + def __uuid__(self) -> str: + return self._uuid + + +class GetColumn(SelectOperator): + def __init__(self, name: str, input_type: pa.DataType): + super().__init__(name) + self._name = name + self._output_type = input_type + + @property + def output_type(self) -> pa.DataType: + return self._output_type + + @property + def output_name(self) -> Optional[str]: + return self._name + + def execute(self, context: "SelectExecutionContext") -> None: + context[self] = context.df[self._name] + + +class LitColumn(SelectOperator): + def __init__(self, value: Any, input_type: Optional[pa.DataType] = None): + super().__init__(value) + self._value = value + self._output_type = pa.scalar(value, input_type).type + + @property + def output_type(self) -> pa.DataType: + return self._output_type + + def execute(self, context: "SelectExecutionContext") -> None: + context[self] = self._value + + +class UnaryOperator(SelectOperator): + def __init__(self, op: str, col: SelectOperator): + super().__init__(op, col) + self._op = op + self._col = col + self._output_type = self._get_output_type(op, col.output_type) + + @property + def output_type(self) -> pa.DataType: + return self._output_type + + @property + def output_name(self) -> Optional[str]: + return self._col.output_name + + def execute(self, context: "SelectExecutionContext") -> None: + if self._op in ["+", "-"]: + context[self] = context.utils.unary_arithmetic_op( + context[self._col], op=self._op + ) + elif self._op == "~": + context[self] = context.utils.logical_not(context[self._col]) + else: + raise NotImplementedError(self._op) # pragma: no cover + + def _get_output_type(self, op: str, input_type: pa.DataType) -> pa.DataType: + if op == "+": + if pa.types.is_integer(input_type) or pa.types.is_floating(input_type): + return input_type + elif op == "-": + if pa.types.is_integer(input_type): + return pa.int64() + if pa.types.is_floating(input_type): + return input_type 
+ elif op == "~": + if pa.types.is_boolean(input_type): + return input_type + raise ValueError(f"'{op}' can't be applied to {input_type}") + + +class BinaryOperator(SelectOperator): + def __init__(self, op: str, col1: SelectOperator, col2: SelectOperator): + super().__init__(op, col1, col2) + self._op = op + self._col1 = col1 + self._col2 = col2 + self._output_type = self._get_output_type( + op, col1.output_type, col2.output_type + ) + + @property + def output_type(self) -> pa.DataType: + return self._output_type + + def execute(self, context: "SelectExecutionContext") -> None: + if self._op in ["+", "-", "*", "/"]: + context[self] = context.utils.binary_arithmetic_op( + context[self._col1], context[self._col2], op=self._op + ) + elif self._op in ["&", "|"]: + context[self] = context.utils.binary_logical_op( + context[self._col1], + context[self._col2], + op="and" if self._op == "&" else "or", + ) + else: + raise NotImplementedError(self._op) # pragma: no cover + + def _get_output_type( # noqa: C901 + self, op: str, t1: pa.DataType, t2: pa.DataType + ) -> pa.DataType: + if op == "+": + if pa.types.is_integer(t1): + if pa.types.is_integer(t2): + return pa.int64() + if pa.types.is_floating(t2): + return pa.float64() + elif pa.types.is_floating(t1): + if pa.types.is_integer(t2) or pa.types.is_floating(t2): + return pa.float64() + # TODO: time + interval + if op == "-": + if pa.types.is_integer(t1): + if pa.types.is_integer(t2): + return pa.int64() + if pa.types.is_floating(t2): + return pa.float64() + elif pa.types.is_floating(t1): + if pa.types.is_integer(t2) or pa.types.is_floating(t2): + return pa.float64() + # TODO: time - interval + # TODO: time - time + elif op in ["*", "/"]: + if pa.types.is_integer(t1): + if pa.types.is_integer(t2): + return pa.int64() + if pa.types.is_floating(t2): + return pa.float64() + elif pa.types.is_floating(t1): + if pa.types.is_integer(t2) or pa.types.is_floating(t2): + return pa.float64() + elif op in ["&", "|"]: + if 
(pa.types.is_boolean(t1) or pa.types.is_null(t1)) and ( + pa.types.is_boolean(t2) or pa.types.is_null(t2) + ): + return pa.boolean() + raise ValueError(f"'{op}' can't be applied to {t1} and {t2}") + + +class OutputOperator(SelectOperator): + def __init__(self, *args: Union[SelectOperator, Tuple[SelectOperator, str]]): + self._data: List[Any] = [ + (x, x.output_name) if isinstance(x, SelectOperator) else x for x in args + ] + super().__init__(*self._data) + + def execute(self, context: "SelectExecutionContext") -> None: + cols = [context[x] for x, _ in self._data] + names = [y for _, y in self._data] + context.set_output( + context.utils.cols_to_df(cols, names=names, reference=context.df) + ) + + +class SelectExecutionContext: + def __init__(self, utils: SlideUtils, df: Any): + self._utils = utils + self._df = df + self._output: Any = None + self._results: Dict[str, Any] = {} + + @property + def utils(self) -> SlideUtils: + return self._utils + + @property + def df(self) -> Any: + return self._df + + @property + def output(self) -> Any: + return self._output + + def set_output(self, df: Any) -> None: + self._output = df + + def __setitem__(self, op: SelectOperator, value: Any) -> None: + self._results[op.key] = value + + def __getitem__(self, op: SelectOperator) -> None: + return self._results[op.key] + + +class SelectExecutionPlan: + def __init__(self, input_schema: pa.Schema): + self._input_schema = input_schema + self._steps: List[SelectOperator] = [] + self._steps_dict: Dict[str, SelectOperator] = {} + + def add(self, op: SelectOperator) -> SelectOperator: + key = op.key + if key in self._steps_dict: + return self._steps_dict[key] + self._steps_dict[key] = op + self._steps.append(op) + return op + + def col(self, name: str) -> SelectOperator: + return self.add(GetColumn(name, self._input_schema.field_by_name(name).type)) + + def lit( + self, value: Any, input_type: Optional[pa.DataType] = None + ) -> SelectOperator: + return self.add(LitColumn(value, 
input_type)) + + def unary(self, op: str, col: SelectOperator) -> SelectOperator: + return self.add(UnaryOperator(op, col)) + + def binary( + self, op: str, col1: SelectOperator, col2: SelectOperator + ) -> SelectOperator: + return self.add(BinaryOperator(op, col1, col2)) + + def output(self, *args: Union[SelectOperator, Tuple[SelectOperator, str]]) -> None: + self.add(OutputOperator(*args)) + + def __len__(self) -> int: + return len(self._steps) + + def __uuid__(self) -> str: + return to_uuid(str(Schema(self._input_schema)), self._steps) + + def execute(self, context: SelectExecutionContext) -> None: + for step in self._steps: + step.execute(context) diff --git a/slide/utils.py b/slide/utils.py index 2f3f312..aee86d5 100644 --- a/slide/utils.py +++ b/slide/utils.py @@ -8,6 +8,7 @@ Optional, Tuple, TypeVar, + Union, ) import numpy as np @@ -102,17 +103,17 @@ def series_to_array(self, col: TCol) -> List[Any]: """ raise NotImplementedError # pragma: no cover - def to_constant_series( + def scalar_to_series( self, - constant: Any, - from_series: TCol, + scalar: Any, + reference: Union[TCol, TDf], dtype: Any = None, name: Optional[str] = None, ) -> TCol: # pragma: no cover - """Convert a constant to a series with the same index of ``from_series`` + """Convert a scalar to a series with the same index of ``reference`` - :param constant: the constant - :param from_series: the reference series for index + :param scalar: the scalar + :param reference: the reference series or dataframe for index :param dtype: default data type, defaults to None :param name: name of the series, defaults to None :return: the series @@ -120,9 +121,9 @@ def to_constant_series( raise NotImplementedError def get_col_pa_type(self, col: Any) -> pa.DataType: - """Get column or constant pyarrow data type + """Get column or scalar pyarrow data type - :param col: the column or the constant + :param col: the column or the scalar :return: pyarrow data type """ if self.is_series(col): @@ -133,11 +134,11 @@ 
def get_col_pa_type(self, col: Any) -> pa.DataType: return self.to_safe_pa_type(type(col)) def unary_arithmetic_op(self, col: Any, op: str) -> Any: - """Unary arithmetic operator on series/constants + """Unary arithmetic operator on series/scalars - :param col: a series or a constant + :param col: a series or a scalar :param op: can be ``+`` or ``-`` - :return: the transformed series or constant + :return: the transformed series or scalar :raises NotImplementedError: if ``op`` is not supported .. note: @@ -153,10 +154,10 @@ def unary_arithmetic_op(self, col: Any, op: str) -> Any: def binary_arithmetic_op(self, col1: Any, col2: Any, op: str) -> Any: """Binary arithmetic operations ``+``, ``-``, ``*``, ``/`` - :param col1: the first column (series or constant) - :param col2: the second column (series or constant) + :param col1: the first column (series or scalar) + :param col2: the second column (series or scalar) :param op: ``+``, ``-``, ``*``, ``/`` - :return: the result after the operation (series or constant) + :return: the result after the operation (series or scalar) :raises NotImplementedError: if ``op`` is not supported .. note: @@ -176,10 +177,10 @@ def binary_arithmetic_op(self, col1: Any, col2: Any, op: str) -> Any: def comparison_op(self, col1: Any, col2: Any, op: str) -> Any: """Binary comparison ``<``, ``<=``, ``==``, ``>``, ``>=`` - :param col1: the first column (series or constant) - :param col2: the second column (series or constant) + :param col1: the first column (series or scalar) + :param col2: the second column (series or scalar) :param op: ``<``, ``<=``, ``==``, ``>``, ``>=`` - :return: the result after the operation (series or constant) + :return: the result after the operation (series or scalar) :raises NotImplementedError: if ``op`` is not supported .. 
note: @@ -208,10 +209,10 @@ def comparison_op(self, col1: Any, col2: Any, op: str) -> Any: def binary_logical_op(self, col1: Any, col2: Any, op: str) -> Any: """Binary logical operations ``and``, ``or`` - :param col1: the first column (series or constant) - :param col2: the second column (series or constant) + :param col1: the first column (series or scalar) + :param col2: the second column (series or scalar) :param op: ``and``, ``or`` - :return: the result after the operation (series or constant) + :return: the result after the operation (series or scalar) :raises NotImplementedError: if ``op`` is not supported .. note: @@ -248,12 +249,12 @@ def cast( # noqa: C901 """Cast ``col`` to a new type. ``type_obj`` must be able to be converted by :func:`~triad.utils.pyarrow.self.to_safe_pa_type`. - :param col: a series or a constant + :param col: a series or a scalar :param type_obj: an objected that can be accepted by :func:`~triad.utils.pyarrow.self.to_safe_pa_type` :param input_type: an objected that is either None or to be accepted by :func:`~triad.utils.pyarrow.self.to_safe_pa_type`, defaults to None. - :return: the new column or constant + :return: the new column or scalar .. note: @@ -338,10 +339,10 @@ def cast( # noqa: C901 raise SlideCastError(str(te)) from te def filter_df(self, df: TDf, cond: Any) -> TDf: - """Filter dataframe by a boolean series or a constant + """Filter dataframe by a boolean series or a scalar :param df: the dataframe - :param cond: a boolean seris or a constant + :param cond: a boolean seris or a scalar :return: the filtered dataframe .. 
note: @@ -357,9 +358,9 @@ def filter_df(self, df: TDf, cond: Any) -> TDf: return df.head(0) def is_value(self, col: Any, value: Any, positive: bool = True) -> Any: - """Check if the series or constant is ``value`` + """Check if the series or scalar is ``value`` - :param col: the series or constant + :param col: the series or scalar :param value: ``None``, ``True`` or ``False`` :param positive: check ``is value`` or ``is not value``, defaults to True (``is value``) @@ -389,12 +390,12 @@ def is_value(self, col: Any, value: Any, positive: bool = True) -> Any: )[0] def is_in(self, col: Any, values: List[Any], positive: bool) -> Any: # noqa: C901 - """Check if a series or a constant is in ``values`` + """Check if a series or a scalar is in ``values`` - :param col: the series or the constant - :param values: a list of constants and series (can mix) + :param col: the series or the scalar + :param values: a list of scalars and series (can mix) :param positive: ``is in`` or ``is not in`` - :return: the correspondent boolean series or constant + :return: the correspondent boolean series or scalar .. 
note: @@ -404,13 +405,13 @@ def is_in(self, col: Any, values: List[Any], positive: bool) -> Any: # noqa: C9 if self.is_series(col): cols = [x for x in values if self.is_series(x)] others = [x for x in values if not self.is_series(x)] - has_null_constant = any(pd.isna(x) for x in others) + has_null = any(pd.isna(x) for x in others) innulls: Any = None if positive: o: Any = col.isin(others) for c in cols: o = o | (col == c) - if not has_null_constant: + if not has_null: if innulls is None: innulls = c.isnull() else: @@ -419,12 +420,12 @@ def is_in(self, col: Any, values: List[Any], positive: bool) -> Any: # noqa: C9 o = ~col.isin(others) for c in cols: o = o & (col != c) - if not has_null_constant: + if not has_null: if innulls is None: innulls = c.isnull() else: innulls = innulls | c.isnull() - if has_null_constant: + if has_null: o = o.mask(o == (0 if positive else 1), None) elif innulls is not None: o = o.mask(innulls & (o == (0 if positive else 1)), None) @@ -436,13 +437,13 @@ def is_in(self, col: Any, values: List[Any], positive: bool) -> Any: # noqa: C9 return None if pd.isna(res) else bool(res) def is_between(self, col: Any, lower: Any, upper: Any, positive: bool) -> Any: - """Check if a series or a constant is ``>=lower`` and ``<=upper`` + """Check if a series or a scalar is ``>=lower`` and ``<=upper`` - :param col: the series or the constant - :param lower: the lower bound, which can be series or a constant - :param upper: the upper bound, which can be series or a constant + :param col: the series or the scalar + :param lower: the lower bound, which can be series or a scalar + :param upper: the upper bound, which can be series or a scalar :param positive: ``is between`` or ``is not between`` - :return: the correspondent boolean series or constant + :return: the correspondent boolean series or scalar .. 
note: @@ -453,12 +454,12 @@ def is_between(self, col: Any, lower: Any, upper: Any, positive: bool) -> Any: return None if self.is_series(col): left = ( - self.to_constant_series(False, col) + self.scalar_to_series(False, col) if lower is None else (lower <= col).fillna(False) ) right = ( - self.to_constant_series(False, col) + self.scalar_to_series(False, col) if upper is None else (col <= upper).fillna(False) ) @@ -490,10 +491,10 @@ def is_between(self, col: Any, lower: Any, upper: Any, positive: bool) -> Any: return None if pd.isna(res) else bool(res) def coalesce(self, cols: List[Any]) -> Any: - """Coalesce multiple series and constants + """Coalesce multiple series and scalars - :param cols: the collection of series and constants in order - :return: the coalesced series or constant + :param cols: the collection of series and scalars in order + :return: the coalesced series or scalar .. note: @@ -511,10 +512,10 @@ def case_when(self, *pairs: Tuple[Any, Any], default: Any = None) -> Any: """SQL ``CASE WHEN`` :param pairs: condition and value pairs, both can be either a - series or a constant + series or a scalar :param default: default value if none of the conditions satisfies, defaults to None - :return: the final series or constant + :return: the final series or scalar .. note: @@ -555,11 +556,11 @@ def like( # noqa: C901 ) -> Any: """SQL ``LIKE`` - :param col: a series or a constant + :param col: a series or a scalar :param expr: a pattern expression :param ignore_case: whether to ignore case, defaults to False :param positive: ``LIKE`` or ``NOT LIKE``, defaults to True - :return: the correspondent boolean series or constant + :return: the correspondent boolean series or scalar .. 
note: @@ -609,7 +610,7 @@ def like_series(col: TCol) -> TCol: if self.is_series(col): if expr is None: - return self.to_constant_series(float("nan"), col) + return self.scalar_to_series(float("nan"), col) nulls = col.isnull() res = like_series(col) if positive: @@ -621,12 +622,19 @@ def like_series(col: TCol) -> TCol: )[0] return None if pd.isna(res) else bool(res) - def cols_to_df(self, cols: List[Any], names: Optional[List[str]] = None) -> TDf: + def cols_to_df( + self, + cols: List[Any], + names: Optional[List[str]] = None, + reference: Union[TCol, TDf, None] = None, + ) -> TDf: """Construct the dataframe from a list of columns (series) - :param cols: the collection of series or constants, at least one value must + :param cols: the collection of series or scalars, at least one value must be a series :param names: the correspondent column names, defaults to None + :param reference: the reference series of dataframe when all cols are scalars + , defaults to None :return: the dataframe diff --git a/slide_dask/utils.py b/slide_dask/utils.py index aaef79e..a5ab674 100644 --- a/slide_dask/utils.py +++ b/slide_dask/utils.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, List, Optional +from typing import Any, Callable, List, Optional, Union import dask.dataframe as dd import numpy as np @@ -8,6 +8,7 @@ from slide.utils import SlideUtils from triad.utils.assertion import assert_or_throw from triad.utils.pyarrow import to_pandas_dtype +from pandas.api.types import is_object_dtype class DaskUtils(SlideUtils[dd.DataFrame, dd.Series]): @@ -33,38 +34,76 @@ def to_series(self, obj: Any, name: Optional[str] = None) -> dd.Series: def series_to_array(self, col: dd.Series) -> List[Any]: return col.compute().tolist() - def to_constant_series( + def scalar_to_series( self, - constant: Any, - from_series: dd.Series, + scalar: Any, + reference: Union[dd.Series, dd.DataFrame], dtype: Any = None, name: Optional[str] = None, ) -> dd.Series: + if 
pa.types.is_nested(pa.scalar(scalar).type): + assert_or_throw( + dtype is None or is_object_dtype(dtype), + ValueError( + "for nested scalar type, dtype must be None or dtype(object)" + ), + ) + if self.is_series(reference): + return reference.map(lambda _: scalar, meta=(name, dtype)) + else: + return reference[reference.columns[0]].map( + lambda _: scalar, meta=(name, dtype) + ) if dtype is not None: - return from_series.map(lambda _: constant, meta=(name, dtype)) - tdf = from_series.to_frame() + if self.is_series(reference): + return reference.map(lambda _: scalar, meta=(name, dtype)) + else: + return reference[reference.columns[0]].map( + lambda _: scalar, meta=(name, dtype) + ) + tdf = reference.to_frame() if isinstance(reference, dd.Series) else reference tn = name or "_tmp_" - tdf[tn] = constant + tdf[tn] = scalar return tdf[tn] - def cols_to_df( - self, cols: List[Any], names: Optional[List[str]] = None + def cols_to_df( # noqa: C901 + self, + cols: List[Any], + names: Optional[List[str]] = None, + reference: Union[dd.Series, dd.DataFrame, None] = None, ) -> dd.DataFrame: - assert_or_throw( - any(self.is_series(s) for s in cols), - SlideInvalidOperation("at least one value in cols should be series"), - ) + _cols = list(cols) + _ref: Any = None + _nested: List[int] = [] + _ref_idx = -1 + for i in range(len(cols)): + if self.is_series(_cols[i]): + if _ref is None: + _ref = _cols[i] + _ref_idx = i + elif pa.types.is_nested(pa.scalar(_cols[i]).type): + _nested.append(i) + if _ref is None: + assert_or_throw( + reference is not None, + SlideInvalidOperation( + "reference can't be null when all cols are scalars" + ), + ) + _cols[0] = self.scalar_to_series(_cols[0], reference=reference) + _ref = _cols[0] + _ref_idx = 0 + for n in _nested: + if not self.is_series(_cols[n]): + _cols[n] = self.scalar_to_series(_cols[n], reference=_ref) if names is None: col_names: List[str] = [c.name for c in cols] else: col_names = names - for i in range(len(cols)): - if 
self.is_series(cols[i]): - break - tdf = cols[i].to_frame(col_names[i]) - for j in range(len(cols)): - if i != j: - tdf[col_names[j]] = cols[j] + tdf = _ref.to_frame(col_names[_ref_idx]) + for j in range(len(_cols)): + if _ref_idx != j: + tdf[col_names[j]] = _cols[j] return tdf[col_names] def is_compatile_index(self, df: dd.DataFrame) -> bool: diff --git a/slide_pandas/__init__.py b/slide_pandas/__init__.py index e69de29..ca74f5f 100644 --- a/slide_pandas/__init__.py +++ b/slide_pandas/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa +from slide_pandas.utils import PandasUtils diff --git a/slide_pandas/utils.py b/slide_pandas/utils.py index 1371826..e079d94 100644 --- a/slide_pandas/utils.py +++ b/slide_pandas/utils.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import pandas as pd @@ -32,25 +32,59 @@ def to_series(self, obj: Any, name: Optional[str] = None) -> pd.Series: def series_to_array(self, col: pd.Series) -> List[Any]: return col.tolist() - def to_constant_series( + def scalar_to_series( self, - constant: Any, - from_series: pd.Series, + scalar: Any, + reference: Union[pd.Series, pd.DataFrame], dtype: Any = None, name: Optional[str] = None, ) -> pd.Series: - return pd.Series(constant, index=from_series.index, dtype=dtype, name=name) + if pa.types.is_nested(pa.scalar(scalar).type): + assert_or_throw( + dtype is None or is_object_dtype(dtype), + ValueError( + "for nested scalar type, dtype must be None or dtype(object)" + ), + ) + if self.is_series(reference): + res = reference.map(lambda _: scalar) + else: + res = reference[reference.columns[0]].map(lambda _: scalar) + if name is not None: + res = res.rename(name) + return res + return pd.Series(scalar, index=reference.index, dtype=dtype, name=name) def cols_to_df( - self, cols: List[pd.Series], names: Optional[List[str]] = None + self, + cols: List[Any], + names: 
Optional[List[str]] = None, + reference: Union[pd.Series, pd.DataFrame, None] = None, ) -> pd.DataFrame: - assert_or_throw( - any(self.is_series(s) for s in cols), - SlideInvalidOperation("at least one value in cols should be series"), - ) + _cols = list(cols) + _ref: Any = None + _nested: List[int] = [] + for i in range(len(cols)): + if self.is_series(_cols[i]): + if _ref is None: + _ref = _cols[i] + elif pa.types.is_nested(pa.scalar(_cols[i]).type): + _nested.append(i) + if _ref is None: + assert_or_throw( + reference is not None, + SlideInvalidOperation( + "reference can't be null when all cols are scalars" + ), + ) + _cols[0] = self.scalar_to_series(_cols[0], reference=reference) + _ref = _cols[0] + for n in _nested: + if not self.is_series(_cols[n]): + _cols[n] = self.scalar_to_series(_cols[n], reference=_ref) if names is None: - return pd.DataFrame({c.name: c for c in cols}) - return pd.DataFrame(dict(zip(names, cols))) + return pd.DataFrame({c.name: c for c in _cols}) + return pd.DataFrame(dict(zip(names, _cols))) def as_pandas(self, df: pd.DataFrame) -> pd.DataFrame: return df diff --git a/slide_test/suite.py b/slide_test/suite.py index 9c1612c..78dcf53 100644 --- a/slide_test/suite.py +++ b/slide_test/suite.py @@ -92,16 +92,45 @@ def test_to_series(self): pd.DataFrame(dict(x=[0, 1], y=[2, 3], z=[4, 5], w=[2, 3])), ) - def test_to_constant_series(self): + def test_scalar_to_series(self): s = self.utils.to_series(pd.Series([0, 1], name="x")) - s1 = self.utils.to_constant_series("a", s, name="y") - s2 = self.utils.to_constant_series(None, s, name="z", dtype="float64") + s1 = self.utils.scalar_to_series("a", s, name="y") + s2 = self.utils.scalar_to_series(None, s, name="z", dtype="float64") df = self.utils.cols_to_df([s, s1, s2]) assert_pdf_eq( self.to_pd(df), pd.DataFrame(dict(x=[0, 1], y=["a", "a"], z=[None, None])), ) + s = self.to_df(pd.DataFrame(dict(x=pd.Series([0, 1])))) + s1 = self.utils.scalar_to_series("a", s, name="y") + s2 = 
self.utils.scalar_to_series(None, s, name="z", dtype="float64") + df = self.utils.cols_to_df([s1, s2]) + assert_pdf_eq( + self.to_pd(df), + pd.DataFrame(dict(y=["a", "a"], z=[None, None])), + ) + + s = self.utils.to_series(pd.Series([0, 1], name="x")) + s1 = self.utils.scalar_to_series({"x": 1}, s, name="y") + s2 = self.utils.scalar_to_series( + [1, 2], s, name="z", dtype=np.dtype(object) + ) + df = self.utils.cols_to_df([s1, s2]) + assert [[{"x": 1}, [1, 2]], [{"x": 1}, [1, 2]]] == self.to_pd( + df + ).values.tolist() + + s = self.to_df(pd.DataFrame(dict(x=pd.Series([0, 1])))) + s1 = self.utils.scalar_to_series( + {"x": 1}, s, name="y", dtype=np.dtype(object) + ) + s2 = self.utils.scalar_to_series([1, 2], s, name="z") + df = self.utils.cols_to_df([s1, s2]) + assert [[{"x": 1}, [1, 2]], [{"x": 1}, [1, 2]]] == self.to_pd( + df + ).values.tolist() + def test_get_col_pa_type(self): df = self.to_df( [["a", 1, 1.1, True, datetime.now()]], @@ -1115,7 +1144,7 @@ def test_like_sql(self): check_order=False, ) - def test_cast_constant(self): + def test_cast_scalar(self): assert self.utils.cast(None, bool) is None assert self.utils.cast(True, bool) assert not self.utils.cast(False, bool) @@ -1885,6 +1914,25 @@ def test_cols_to_df(self): with raises(SlideInvalidOperation): res = self.utils.cols_to_df([123, 456], names=["x", "y"]) + res = self.utils.cols_to_df([123, 456], names=["x", "y"], reference=df) + assert_pdf_eq( + self.to_pd(res), self.to_pd(self.to_df([[123, 456]], "x:long,y:long")) + ) + + # has nested type, all scalars + res = self.utils.cols_to_df([[1, 2], 456], names=["x", "y"], reference=df) + assert [[[1, 2], 456]] == self.to_pd(res).values.tolist() + + res = self.utils.cols_to_df([456, [1, 2]], names=["x", "y"], reference=df) + assert [[456, [1, 2]]] == self.to_pd(res).values.tolist() + + # has nested type, and series + res = self.utils.cols_to_df([[1, 2], df["a"]], names=["x", "y"]) + assert [[[1, 2], "a"]] == self.to_pd(res).values.tolist() + + res = 
self.utils.cols_to_df([df["a"], [1, 2]], names=["x", "y"]) + assert [["a", [1, 2]]] == self.to_pd(res).values.tolist() + def test_to_schema(self): df = self.to_df([[1.0, 2], [2.0, 3]]) raises(ValueError, lambda: self.utils.to_schema(df)) diff --git a/tests/slide/operators/test_select_operators.py b/tests/slide/operators/test_select_operators.py new file mode 100644 index 0000000..8450c03 --- /dev/null +++ b/tests/slide/operators/test_select_operators.py @@ -0,0 +1,141 @@ +import pandas as pd +from slide.operators.select_operators import SelectExecutionContext, SelectExecutionPlan +from slide_pandas import PandasUtils +from slide_test.utils import assert_duck_eq, assert_pdf_eq +from triad import Schema +import pyarrow as pa +from pytest import raises + + +def test_col_op(): + pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"]) + ctx = SelectExecutionContext(PandasUtils(), pdf) + plan = SelectExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) + col0 = plan.col("c") + assert pa.bool_() == col0.output_type + assert "c" == col0.output_name + col1 = plan.col("a") + assert pa.uint32() == col1.output_type + assert "a" == col1.output_name + col2 = plan.col("b") + assert pa.float32() == col2.output_type + assert "b" == col2.output_name + + plan.output(col0, col1, col2) + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT c, a, b FROM a", + a=pdf, + check_order=False, + ) + + +def test_lit_op(): + pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"]) + ctx = SelectExecutionContext(PandasUtils(), pdf) + plan = SelectExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) + col0 = plan.lit(None) + assert pa.null() == col0.output_type + col1 = plan.lit("abc") + assert pa.string() == col1.output_type + col2 = plan.lit(1, pa.uint8()) + assert pa.uint8() == col2.output_type + col3 = plan.col("a") + + plan.output((col1, "x"), (col2, "y"), col3) + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT 
'abc' AS x, 1 AS y, a FROM a", + a=pdf, + check_order=False, + ) + + +def test_pure_lit_op(): + pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"]) + ctx = SelectExecutionContext(PandasUtils(), pdf) + plan = SelectExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) + col0 = plan.lit(None) + assert pa.null() == col0.output_type + col1 = plan.lit("abc") + assert pa.string() == col1.output_type + col2 = plan.lit(1, pa.uint8()) + assert pa.uint8() == col2.output_type + col3 = plan.lit(b"\0abc") + assert pa.binary() == col3.output_type + col4 = plan.lit([1, 2]) + assert pa.types.is_nested(col4.output_type) + + plan.output((col1, "a"), (col2, "b"), (col3, "c"), (col4, "d")) + plan.execute(ctx) + + expected = [["abc", 1, b"\0abc", [1, 2]], ["abc", 1, b"\0abc", [1, 2]]] + + assert expected == ctx.output.astype(object).values.tolist() + + +def test_unary_op(): + pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"]) + ctx = SelectExecutionContext(PandasUtils(), pdf) + plan = SelectExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) + col0 = plan.col("c") + col1 = plan.col("a") + col2 = plan.col("b") + col3 = plan.unary("+", col1) + assert pa.uint32() == col3.output_type + assert "a" == col3.output_name + col4 = plan.unary("-", col1) + assert pa.int64() == col4.output_type + col5 = plan.unary("+", col2) + assert pa.float32() == col5.output_type + col6 = plan.unary("-", col2) + assert pa.float32() == col6.output_type + + raises(ValueError, lambda: plan.unary("-", col0)) + raises(ValueError, lambda: plan.unary("+", col0)) + raises(ValueError, lambda: plan.unary("~", col1)) + raises(ValueError, lambda: plan.unary("~", col2)) + + col10 = plan.unary("~", col0) + plan.output((col3, "c3"), (col4, "c4"), (col5, "c5"), (col6, "c6"), (col10, "c10")) + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + """ + SELECT + a AS c3, -a AS c4, + b AS c5, -b AS c6, + NOT c AS c10 + FROM a + """, + a=pdf, + check_order=False, + 
) + + +def test_plan(): + pdf = pd.DataFrame([[0, 1.1], [3, 4.1]], columns=["a", "b"]) + ctx = SelectExecutionContext(PandasUtils(), pdf) + plan = SelectExecutionPlan(Schema("a:int,b:float").pa_schema) + col1 = plan.col("b") + col2 = plan.col("a") + col3 = plan.binary("+", col1, col2) + col4 = plan.binary("-", col3, plan.lit(2)) + l1 = len(plan) + col5 = plan.binary("+", col1, col2) # dedupped + assert l1 == len(plan) + col6 = plan.unary("-", col5) + # a, b, a+b as x, a+b-2 as y, -(a+b) as z + plan.output(col1, col2, (col3, "x"), (col4, "y"), (col6, "z")) + plan.execute(ctx) + assert_duck_eq( + ctx.output, + "SELECT a, b, a+b AS x, a+b-2 AS y, -(a+b) AS z FROM a", + a=pdf, + check_order=False, + ) From 7240f1f9a6099b07fc82afb9975dd8fdc06ca71e Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 13 Dec 2021 02:07:41 +0000 Subject: [PATCH 4/7] map operators --- .../{select_operators.py => map_operators.py} | 80 ++++--- slide/utils.py | 12 +- slide_test/suite.py | 10 +- .../slide/operators/test_select_operators.py | 216 +++++++++++++++++- 4 files changed, 265 insertions(+), 53 deletions(-) rename slide/operators/{select_operators.py => map_operators.py} (75%) diff --git a/slide/operators/select_operators.py b/slide/operators/map_operators.py similarity index 75% rename from slide/operators/select_operators.py rename to slide/operators/map_operators.py index 45e39b8..980e0d8 100644 --- a/slide/operators/select_operators.py +++ b/slide/operators/map_operators.py @@ -1,11 +1,12 @@ from typing import Any, Dict, List, Optional, Tuple, Union import pyarrow as pa +import pandas as pd from slide.utils import SlideUtils from triad import Schema, to_uuid -class SelectOperator: +class MapOperator: def __init__(self, *args: Any, **kwargs: Any): self._args = args self._kwargs = kwargs @@ -20,7 +21,7 @@ def identifier(self) -> str: def key(self) -> str: return "_" + to_uuid(self)[:8] - def execute(self, context: "SelectExecutionContext") -> None: + def execute(self, context: 
"MapOperationsContext") -> None: raise NotImplementedError # pragma: no cover @property @@ -35,7 +36,7 @@ def __uuid__(self) -> str: return self._uuid -class GetColumn(SelectOperator): +class GetColumn(MapOperator): def __init__(self, name: str, input_type: pa.DataType): super().__init__(name) self._name = name @@ -49,11 +50,11 @@ def output_type(self) -> pa.DataType: def output_name(self) -> Optional[str]: return self._name - def execute(self, context: "SelectExecutionContext") -> None: + def execute(self, context: "MapOperationsContext") -> None: context[self] = context.df[self._name] -class LitColumn(SelectOperator): +class LitColumn(MapOperator): def __init__(self, value: Any, input_type: Optional[pa.DataType] = None): super().__init__(value) self._value = value @@ -63,12 +64,12 @@ def __init__(self, value: Any, input_type: Optional[pa.DataType] = None): def output_type(self) -> pa.DataType: return self._output_type - def execute(self, context: "SelectExecutionContext") -> None: + def execute(self, context: "MapOperationsContext") -> None: context[self] = self._value -class UnaryOperator(SelectOperator): - def __init__(self, op: str, col: SelectOperator): +class UnaryOperator(MapOperator): + def __init__(self, op: str, col: MapOperator): super().__init__(op, col) self._op = op self._col = col @@ -82,7 +83,7 @@ def output_type(self) -> pa.DataType: def output_name(self) -> Optional[str]: return self._col.output_name - def execute(self, context: "SelectExecutionContext") -> None: + def execute(self, context: "MapOperationsContext") -> None: if self._op in ["+", "-"]: context[self] = context.utils.unary_arithmetic_op( context[self._col], op=self._op @@ -107,8 +108,8 @@ def _get_output_type(self, op: str, input_type: pa.DataType) -> pa.DataType: raise ValueError(f"'{op}' can't be applied to {input_type}") -class BinaryOperator(SelectOperator): - def __init__(self, op: str, col1: SelectOperator, col2: SelectOperator): +class BinaryOperator(MapOperator): + def 
__init__(self, op: str, col1: MapOperator, col2: MapOperator): super().__init__(op, col1, col2) self._op = op self._col1 = col1 @@ -121,11 +122,18 @@ def __init__(self, op: str, col1: SelectOperator, col2: SelectOperator): def output_type(self) -> pa.DataType: return self._output_type - def execute(self, context: "SelectExecutionContext") -> None: + def execute(self, context: "MapOperationsContext") -> None: if self._op in ["+", "-", "*", "/"]: - context[self] = context.utils.binary_arithmetic_op( + res = context.utils.binary_arithmetic_op( context[self._col1], context[self._col2], op=self._op ) + if ( # int/int -> int + pa.types.is_integer(self._col1.output_type) + and pa.types.is_integer(self._col2.output_type) + and not pd.api.types.is_integer_dtype(res.dtype) + ): + res = context.utils.cast(res, "int64") + context[self] = res elif self._op in ["&", "|"]: context[self] = context.utils.binary_logical_op( context[self._col1], @@ -172,18 +180,20 @@ def _get_output_type( # noqa: C901 if (pa.types.is_boolean(t1) or pa.types.is_null(t1)) and ( pa.types.is_boolean(t2) or pa.types.is_null(t2) ): - return pa.boolean() - raise ValueError(f"'{op}' can't be applied to {t1} and {t2}") + return pa.bool_() + raise ValueError( # pragma: no cover + f"'{op}' can't be applied to {t1} and {t2}" + ) -class OutputOperator(SelectOperator): - def __init__(self, *args: Union[SelectOperator, Tuple[SelectOperator, str]]): +class MapOutputOperator(MapOperator): + def __init__(self, *args: Union[MapOperator, Tuple[MapOperator, str]]): self._data: List[Any] = [ - (x, x.output_name) if isinstance(x, SelectOperator) else x for x in args + (x, x.output_name) if isinstance(x, MapOperator) else x for x in args ] super().__init__(*self._data) - def execute(self, context: "SelectExecutionContext") -> None: + def execute(self, context: "MapOperationsContext") -> None: cols = [context[x] for x, _ in self._data] names = [y for _, y in self._data] context.set_output( @@ -191,7 +201,7 @@ def 
execute(self, context: "SelectExecutionContext") -> None: ) -class SelectExecutionContext: +class MapOperationsContext: def __init__(self, utils: SlideUtils, df: Any): self._utils = utils self._df = df @@ -213,20 +223,20 @@ def output(self) -> Any: def set_output(self, df: Any) -> None: self._output = df - def __setitem__(self, op: SelectOperator, value: Any) -> None: + def __setitem__(self, op: MapOperator, value: Any) -> None: self._results[op.key] = value - def __getitem__(self, op: SelectOperator) -> None: + def __getitem__(self, op: MapOperator) -> None: return self._results[op.key] -class SelectExecutionPlan: +class MapExecutionPlan: def __init__(self, input_schema: pa.Schema): self._input_schema = input_schema - self._steps: List[SelectOperator] = [] - self._steps_dict: Dict[str, SelectOperator] = {} + self._steps: List[MapOperator] = [] + self._steps_dict: Dict[str, MapOperator] = {} - def add(self, op: SelectOperator) -> SelectOperator: + def add(self, op: MapOperator) -> MapOperator: key = op.key if key in self._steps_dict: return self._steps_dict[key] @@ -234,24 +244,20 @@ def add(self, op: SelectOperator) -> SelectOperator: self._steps.append(op) return op - def col(self, name: str) -> SelectOperator: + def col(self, name: str) -> MapOperator: return self.add(GetColumn(name, self._input_schema.field_by_name(name).type)) - def lit( - self, value: Any, input_type: Optional[pa.DataType] = None - ) -> SelectOperator: + def lit(self, value: Any, input_type: Optional[pa.DataType] = None) -> MapOperator: return self.add(LitColumn(value, input_type)) - def unary(self, op: str, col: SelectOperator) -> SelectOperator: + def unary(self, op: str, col: MapOperator) -> MapOperator: return self.add(UnaryOperator(op, col)) - def binary( - self, op: str, col1: SelectOperator, col2: SelectOperator - ) -> SelectOperator: + def binary(self, op: str, col1: MapOperator, col2: MapOperator) -> MapOperator: return self.add(BinaryOperator(op, col1, col2)) - def output(self, 
*args: Union[SelectOperator, Tuple[SelectOperator, str]]) -> None: - self.add(OutputOperator(*args)) + def output(self, *args: Union[MapOperator, Tuple[MapOperator, str]]) -> None: + self.add(MapOutputOperator(*args)) def __len__(self) -> int: return len(self._steps) @@ -259,6 +265,6 @@ def __len__(self) -> int: def __uuid__(self) -> str: return to_uuid(str(Schema(self._input_schema)), self._steps) - def execute(self, context: SelectExecutionContext) -> None: + def execute(self, context: MapOperationsContext) -> None: for step in self._steps: step.execute(context) diff --git a/slide/utils.py b/slide/utils.py index aee86d5..ea319b2 100644 --- a/slide/utils.py +++ b/slide/utils.py @@ -171,6 +171,10 @@ def binary_arithmetic_op(self, col1: Any, col2: Any, op: str) -> Any: if op == "*": return col1 * col2 if op == "/": + # for int/int, we should force the output to int + # but int columns can be in non-int types so it is impossible + # to judge from the dtypes of the input, so the logic using + # this function should be responsible to do this check return col1 / col2 raise NotImplementedError(f"{op} is not supported") # pragma: no cover @@ -223,12 +227,16 @@ def binary_logical_op(self, col1: Any, col2: Any, op: str) -> Any: c2 = self._safe_bool(col2) if op == "and": if not self.is_series(c1) and not self.is_series(c2): + if c1 is None: + return c2 and c1 return c1 and c2 - return c1 & c2 + return (pd.NA if c1 is None else c1) & (pd.NA if c2 is None else c2) elif op == "or": if not self.is_series(c1) and not self.is_series(c2): + if c1 is None: + return c2 or c1 return c1 or c2 - return c1 | c2 + return (pd.NA if c1 is None else c1) | (pd.NA if c2 is None else c2) raise NotImplementedError(f"{op} is not supported") # pragma: no cover def logical_not(self, col: Any) -> Any: diff --git a/slide_test/suite.py b/slide_test/suite.py index 78dcf53..d6cb35b 100644 --- a/slide_test/suite.py +++ b/slide_test/suite.py @@ -334,14 +334,20 @@ def test_(pdf: pd.DataFrame, op: str): 
df["j"] = self.utils.binary_logical_op(True, None, op) df["k"] = self.utils.binary_logical_op(False, None, op) df["l"] = self.utils.binary_logical_op(None, None, op) + df["m"] = self.utils.binary_logical_op(None, True, op) + df["n"] = self.utils.binary_logical_op(None, False, op) + df["o"] = self.utils.binary_logical_op(df.a, None, op) + df["p"] = self.utils.binary_logical_op(None, df.b, op) assert_duck_eq( - self.to_pd(df[list("defghijkl")]), + self.to_pd(df[list("defghijklmnop")]), f""" SELECT a {op} b AS d, a {op} TRUE AS e, TRUE {op} b AS f, a {op} FALSE AS g, FALSE {op} b AS h, TRUE {op} FALSE AS i, - TRUE {op} NULL AS j, FALSE {op} NULL AS k, NULL {op} NULL AS l + TRUE {op} NULL AS j, FALSE {op} NULL AS k, NULL {op} NULL AS l, + NULL {op} TRUE AS m, NULL {op} FALSE AS n, + a {op} NULL AS o, NULL {op} b AS p FROM pdf """, pdf=pdf, diff --git a/tests/slide/operators/test_select_operators.py b/tests/slide/operators/test_select_operators.py index 8450c03..49ec88b 100644 --- a/tests/slide/operators/test_select_operators.py +++ b/tests/slide/operators/test_select_operators.py @@ -1,16 +1,16 @@ import pandas as pd -from slide.operators.select_operators import SelectExecutionContext, SelectExecutionPlan +from slide.operators.map_operators import MapOperationsContext, MapExecutionPlan from slide_pandas import PandasUtils from slide_test.utils import assert_duck_eq, assert_pdf_eq -from triad import Schema +from triad import Schema, to_uuid import pyarrow as pa from pytest import raises def test_col_op(): pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"]) - ctx = SelectExecutionContext(PandasUtils(), pdf) - plan = SelectExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) + ctx = MapOperationsContext(PandasUtils(), pdf) + plan = MapExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) col0 = plan.col("c") assert pa.bool_() == col0.output_type assert "c" == col0.output_name @@ -34,8 +34,8 @@ def test_col_op(): def test_lit_op(): pdf 
= pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"]) - ctx = SelectExecutionContext(PandasUtils(), pdf) - plan = SelectExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) + ctx = MapOperationsContext(PandasUtils(), pdf) + plan = MapExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) col0 = plan.lit(None) assert pa.null() == col0.output_type col1 = plan.lit("abc") @@ -57,8 +57,8 @@ def test_lit_op(): def test_pure_lit_op(): pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"]) - ctx = SelectExecutionContext(PandasUtils(), pdf) - plan = SelectExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) + ctx = MapOperationsContext(PandasUtils(), pdf) + plan = MapExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) col0 = plan.lit(None) assert pa.null() == col0.output_type col1 = plan.lit("abc") @@ -80,8 +80,8 @@ def test_pure_lit_op(): def test_unary_op(): pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"]) - ctx = SelectExecutionContext(PandasUtils(), pdf) - plan = SelectExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) + ctx = MapOperationsContext(PandasUtils(), pdf) + plan = MapExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema) col0 = plan.col("c") col1 = plan.col("a") col2 = plan.col("b") @@ -118,10 +118,183 @@ def test_unary_op(): ) +def test_binary_op_num(): + pdf = pd.DataFrame([[1, 1.1], [3, 4.1]], columns=["a", "b"]) + ctx = MapOperationsContext(PandasUtils(), pdf) + plan = MapExecutionPlan(Schema("a:uint,b:float32").pa_schema) + col1 = plan.col("a") + col2 = plan.col("b") + cola = plan.binary("+", col1, col1) + assert pa.int64() == cola.output_type + colb = plan.binary("-", col1, col1) + assert pa.int64() == colb.output_type + colc = plan.binary("*", col1, col1) + assert pa.int64() == colc.output_type + cold = plan.binary("/", col1, plan.lit(2)) + assert pa.int64() == cold.output_type + + cole = plan.binary("+", col1, col2) + assert pa.float64() == 
cole.output_type + colf = plan.binary("-", col1, col2) + assert pa.float64() == colf.output_type + colg = plan.binary("*", col1, col2) + assert pa.float64() == colg.output_type + colh = plan.binary("/", col1, col2) + assert pa.float64() == colh.output_type + + coli = plan.binary("+", col2, col1) + assert pa.float64() == coli.output_type + colj = plan.binary("-", col2, col1) + assert pa.float64() == colj.output_type + colk = plan.binary("*", col2, col1) + assert pa.float64() == colk.output_type + coll = plan.binary("/", col2, col1) + assert pa.float64() == coll.output_type + + colm = plan.binary("+", col2, col2) + assert pa.float64() == colm.output_type + coln = plan.binary("-", col2, col2) + assert pa.float64() == coln.output_type + colo = plan.binary("*", col2, col2) + assert pa.float64() == colo.output_type + colp = plan.binary("/", col2, col2) + assert pa.float64() == colp.output_type + + plan.output( + (cola, "a"), + (colb, "b"), + (colc, "c"), + (cold, "d"), + (cole, "e"), + (colf, "f"), + (colg, "g"), + (colh, "h"), + (coli, "i"), + (colj, "j"), + (colk, "k"), + (coll, "l"), + (colm, "m"), + (coln, "n"), + (colo, "o"), + (colp, "p"), + ) + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + """ + SELECT + a+a AS a, a-a AS b, a*a AS c, a/2 AS d, + a+b AS e, a-b AS f, a*b AS g, a/b AS h, + b+a AS i, b-a AS j, b*a AS k, b/a AS l, + b+b AS m, b-b AS n, b*b AS o, b/b AS p + FROM a + """, + a=pdf, + check_order=False, + ) + + +def test_binary_op_logical(): + pdf = pd.DataFrame( + [ + [True, True], + [True, False], + [True, None], + [False, True], + [False, False], + [False, None], + [None, True], + [None, False], + [None, None], + ], + columns=["a", "b"], + ) + ctx = MapOperationsContext(PandasUtils(), pdf) + plan = MapExecutionPlan(Schema("a:bool,b:bool").pa_schema) + col1 = plan.col("a") + col2 = plan.col("b") + cola = plan.binary("&", col1, col2) + assert pa.bool_() == cola.output_type + colb = plan.binary("|", col1, col2) + assert pa.bool_() == 
colb.output_type + colc = plan.binary("&", col1, plan.lit(True)) + assert pa.bool_() == colc.output_type + cold = plan.binary("&", col1, plan.lit(False)) + assert pa.bool_() == cold.output_type + cole = plan.binary("&", col1, plan.lit(None)) + assert pa.bool_() == cole.output_type + colf = plan.binary("|", col1, plan.lit(True)) + assert pa.bool_() == colf.output_type + colg = plan.binary("|", col1, plan.lit(False)) + assert pa.bool_() == colg.output_type + colh = plan.binary("|", col1, plan.lit(None)) + assert pa.bool_() == colh.output_type + + plan.output( + (cola, "a"), + (colb, "b"), + (colc, "c"), + (cold, "d"), + (cole, "e"), + (colf, "f"), + (colg, "g"), + (colh, "h"), + ) + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + """ + SELECT + a AND b AS a, a OR b AS b, + a AND TRUE AS c, a AND FALSE AS d, a AND NULL AS e, + a OR TRUE AS f, a OR FALSE AS g, a OR NULL AS h + FROM a + """, + a=pdf, + check_order=False, + ) + + +def test_binary_op_logical_2(): + pdf = pd.DataFrame( + [ + [True, True], + [True, False], + ], + columns=["a", "b"], + ) + ctx = MapOperationsContext(PandasUtils(), pdf) + plan = MapExecutionPlan(Schema("a:bool,b:bool").pa_schema) + output = [] + sql = [] + n = 0 + for op in ["&", "|"]: + for left in [True, False, None]: + for right in [True, False, None]: + name = f"_{n}" + col = plan.binary(op, plan.lit(left), plan.lit(right)) + assert pa.bool_() == col.output_type + output.append((col, name)) + ls = "NULL" if left is None else str(left).upper() + rs = "NULL" if right is None else str(right).upper() + o = "AND" if op == "&" else "OR" + sql.append(f"{ls} {o} {rs} AS {name}") + n += 1 + plan.output(*output) + plan.execute(ctx) + + _sql = ", ".join(sql) + assert_duck_eq( + ctx.output, f"SELECT {_sql} FROM a", a=pdf, check_order=False, debug=True + ) + + def test_plan(): pdf = pd.DataFrame([[0, 1.1], [3, 4.1]], columns=["a", "b"]) - ctx = SelectExecutionContext(PandasUtils(), pdf) - plan = 
SelectExecutionPlan(Schema("a:int,b:float").pa_schema) + ctx = MapOperationsContext(PandasUtils(), pdf) + plan = MapExecutionPlan(Schema("a:int,b:float").pa_schema) col1 = plan.col("b") col2 = plan.col("a") col3 = plan.binary("+", col1, col2) @@ -139,3 +312,22 @@ def test_plan(): a=pdf, check_order=False, ) + + +def test_plan_uuid(): + plan1 = MapExecutionPlan(Schema("a:int,b:float32").pa_schema) + plan2 = MapExecutionPlan(Schema("a:int,b:float64").pa_schema) + plan3 = MapExecutionPlan(Schema("a:int,b:float64").pa_schema) + assert to_uuid(plan1) != to_uuid(plan2) + assert to_uuid(plan3) == to_uuid(plan2) + + plan2.col("a") + plan3.col("b") + tid = to_uuid(plan2) + assert to_uuid(plan3) != to_uuid(plan2) + + plan2.col("a") + assert tid == to_uuid(plan2) + plan2.col("b") + assert tid != to_uuid(plan2) + From 6f0e0c09178c1e6641446f99e1ce00ce3a7d68e8 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 14 Dec 2021 06:50:56 +0000 Subject: [PATCH 5/7] add operators --- slide/operators/df_operators.py | 73 ++++++++++++ slide/operators/graph.py | 89 ++++++++++++++ slide/operators/map_operators.py | 111 ++++++------------ tests/slide/operators/test_df_operators.py | 34 ++++++ ...ect_operators.py => test_map_operators.py} | 6 +- 5 files changed, 236 insertions(+), 77 deletions(-) create mode 100644 slide/operators/df_operators.py create mode 100644 slide/operators/graph.py create mode 100644 tests/slide/operators/test_df_operators.py rename tests/slide/operators/{test_select_operators.py => test_map_operators.py} (98%) diff --git a/slide/operators/df_operators.py b/slide/operators/df_operators.py new file mode 100644 index 0000000..44b127a --- /dev/null +++ b/slide/operators/df_operators.py @@ -0,0 +1,73 @@ +from typing import Any, Callable, Optional + +import pyarrow as pa +from slide.operators.graph import Context, Graph, Operator +from slide.operators.map_operators import MapExecutionPlan, MapOperationsContext + + +class DataFrameOperator(Operator): + @property + def 
output_schema(self) -> pa.Schema: + raise NotImplementedError # pragma: no cover + + +class GetDataFrameOperator(DataFrameOperator): + def __init__(self, df: Any, input_schema: pa.Schema): + super().__init__(id(df), str(input_schema)) + self._df = df + self._schema = input_schema + + @property + def output_schema(self) -> pa.Schema: + return self._schema + + def execute(self, context: Context) -> None: + context[self] = self._df + + +class SelectOperator(DataFrameOperator): + def __init__( + self, df: DataFrameOperator, builder: Callable[[MapExecutionPlan], None] + ): + self._plan = MapExecutionPlan(df.output_schema) + builder(self._plan) + self._output_schema = self._plan.output_schema + self._df = df + super().__init__(df, self._plan) + + @property + def output_schema(self) -> pa.Schema: + return self._output_schema + + def execute(self, context: Context) -> None: + indf = context[self._df] + ctx = MapOperationsContext(context.utils, indf) + self._plan.execute(ctx) + context[self] = ctx.output + + +class OutputDataFrameOperator(DataFrameOperator): + def __init__(self, df: DataFrameOperator): + super().__init__(df) + self._df = df + + def execute(self, context: Context) -> None: + context.set_output(context[self._df]) + + +class ExecutionPlan(Graph): + def __init__(self): + super().__init__() + self._output_schema: Optional[pa.Schema] = None + + def df(self, df: Any, input_schema: pa.Schema) -> Operator: + return self.add(GetDataFrameOperator(df, input_schema)) + + def select( + self, df: DataFrameOperator, builder: Callable[[MapExecutionPlan], None] + ) -> Operator: + return self.add(SelectOperator(df, builder)) + + def output(self, df: DataFrameOperator) -> None: + self.add(OutputDataFrameOperator(df)) + self.set_output_schema(df.output_schema) diff --git a/slide/operators/graph.py b/slide/operators/graph.py new file mode 100644 index 0000000..e4fe9ee --- /dev/null +++ b/slide/operators/graph.py @@ -0,0 +1,89 @@ +from typing import Any, Dict, List, Optional + 
+import pyarrow as pa +from slide.exceptions import SlideInvalidOperation +from slide.utils import SlideUtils +from triad import assert_or_throw, to_uuid + + +class Operator: + def __init__(self, *args: Any, **kwargs: Any): + self._uuid = to_uuid(self.identifier, args, kwargs) + pass + + @property + def identifier(self) -> str: + return str(type(self)) + + @property + def key(self) -> str: + return "_" + to_uuid(self)[:8] + + def __uuid__(self) -> str: + return self._uuid + + def execute(self, context: "Context") -> None: + raise NotImplementedError # pragma: no cover + + +class Context: + def __init__(self, utils: SlideUtils): + self._utils = utils + self._output: Any = None + self._results: Dict[str, Any] = {} + + @property + def utils(self) -> SlideUtils: + return self._utils + + @property + def output(self) -> Any: + return self._output + + def set_output(self, df: Any) -> None: + self._output = df + + def __setitem__(self, op: Operator, value: Any) -> None: + self._results[op.key] = value + + def __getitem__(self, op: Operator) -> None: + return self._results[op.key] + + +class Graph: + def __init__(self): + self._steps: List[Operator] = [] + self._steps_dict: Dict[str, Operator] = {} + self._output_schema: Optional[pa.Schema] = None + + @property + def output_schema(self) -> pa.Schema: + assert_or_throw( + self._output_schema is not None, SlideInvalidOperation("output is not set") + ) + return self._output_schema + + def set_output_schema(self, schema: pa.Schema) -> None: + assert_or_throw( + self._output_schema is None, SlideInvalidOperation("output is already set") + ) + self._output_schema = schema + + def add(self, op: Operator) -> Operator: + key = op.key + if key in self._steps_dict: + return self._steps_dict[key] + self._steps_dict[key] = op + self._steps.append(op) + return op + + @property + def steps(self) -> List[Operator]: + return self._steps + + def __len__(self) -> int: + return len(self._steps) + + def execute(self, context: Context) -> None: 
+ for step in self.steps: + step.execute(context) diff --git a/slide/operators/map_operators.py b/slide/operators/map_operators.py index 980e0d8..e42b1b1 100644 --- a/slide/operators/map_operators.py +++ b/slide/operators/map_operators.py @@ -1,29 +1,14 @@ -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union -import pyarrow as pa import pandas as pd +import pyarrow as pa + +from slide.operators.graph import Context, Graph, Operator from slide.utils import SlideUtils from triad import Schema, to_uuid -class MapOperator: - def __init__(self, *args: Any, **kwargs: Any): - self._args = args - self._kwargs = kwargs - self._uuid = to_uuid(self.identifier, self._args, self._kwargs) - pass - - @property - def identifier(self) -> str: - return str(type(self)) - - @property - def key(self) -> str: - return "_" + to_uuid(self)[:8] - - def execute(self, context: "MapOperationsContext") -> None: - raise NotImplementedError # pragma: no cover - +class MapOperator(Operator): @property def output_type(self) -> pa.DataType: raise NotImplementedError # pragma: no cover @@ -32,13 +17,17 @@ def output_type(self) -> pa.DataType: def output_name(self) -> Optional[str]: raise NotImplementedError # pragma: no cover - def __uuid__(self) -> str: - return self._uuid + def execute(self, context: Context) -> None: + assert isinstance(context, MapOperationsContext) + self.execute_map(context) + + def execute_map(self, context: "MapOperationsContext") -> None: + raise NotImplementedError # pragma: no cover class GetColumn(MapOperator): def __init__(self, name: str, input_type: pa.DataType): - super().__init__(name) + super().__init__(name, str(input_type)) self._name = name self._output_type = input_type @@ -50,13 +39,13 @@ def output_type(self) -> pa.DataType: def output_name(self) -> Optional[str]: return self._name - def execute(self, context: "MapOperationsContext") -> None: + def execute_map(self, context: "MapOperationsContext") 
-> None: # type: ignore context[self] = context.df[self._name] class LitColumn(MapOperator): def __init__(self, value: Any, input_type: Optional[pa.DataType] = None): - super().__init__(value) + super().__init__(value, str(input_type)) self._value = value self._output_type = pa.scalar(value, input_type).type @@ -64,7 +53,7 @@ def __init__(self, value: Any, input_type: Optional[pa.DataType] = None): def output_type(self) -> pa.DataType: return self._output_type - def execute(self, context: "MapOperationsContext") -> None: + def execute_map(self, context: "MapOperationsContext") -> None: context[self] = self._value @@ -83,7 +72,7 @@ def output_type(self) -> pa.DataType: def output_name(self) -> Optional[str]: return self._col.output_name - def execute(self, context: "MapOperationsContext") -> None: + def execute_map(self, context: "MapOperationsContext") -> None: if self._op in ["+", "-"]: context[self] = context.utils.unary_arithmetic_op( context[self._col], op=self._op @@ -122,7 +111,7 @@ def __init__(self, op: str, col1: MapOperator, col2: MapOperator): def output_type(self) -> pa.DataType: return self._output_type - def execute(self, context: "MapOperationsContext") -> None: + def execute_map(self, context: "MapOperationsContext") -> None: if self._op in ["+", "-", "*", "/"]: res = context.utils.binary_arithmetic_op( context[self._col1], context[self._col2], op=self._op @@ -192,8 +181,15 @@ def __init__(self, *args: Union[MapOperator, Tuple[MapOperator, str]]): (x, x.output_name) if isinstance(x, MapOperator) else x for x in args ] super().__init__(*self._data) + self._output_schema = pa.schema( + [pa.field(x[1], x[0].output_type) for x in self._data] + ) - def execute(self, context: "MapOperationsContext") -> None: + @property + def output_schema(self) -> pa.Schema: + return self._output_schema + + def execute_map(self, context: "MapOperationsContext") -> None: cols = [context[x] for x, _ in self._data] names = [y for _, y in self._data] context.set_output( @@ 
-201,70 +197,37 @@ def execute(self, context: "MapOperationsContext") -> None: ) -class MapOperationsContext: +class MapOperationsContext(Context): def __init__(self, utils: SlideUtils, df: Any): - self._utils = utils + super().__init__(utils) self._df = df - self._output: Any = None - self._results: Dict[str, Any] = {} - - @property - def utils(self) -> SlideUtils: - return self._utils @property def df(self) -> Any: return self._df - @property - def output(self) -> Any: - return self._output - - def set_output(self, df: Any) -> None: - self._output = df - - def __setitem__(self, op: MapOperator, value: Any) -> None: - self._results[op.key] = value - def __getitem__(self, op: MapOperator) -> None: - return self._results[op.key] - - -class MapExecutionPlan: +class MapExecutionPlan(Graph): def __init__(self, input_schema: pa.Schema): + super().__init__() self._input_schema = input_schema - self._steps: List[MapOperator] = [] - self._steps_dict: Dict[str, MapOperator] = {} - - def add(self, op: MapOperator) -> MapOperator: - key = op.key - if key in self._steps_dict: - return self._steps_dict[key] - self._steps_dict[key] = op - self._steps.append(op) - return op - - def col(self, name: str) -> MapOperator: + + def col(self, name: str) -> Operator: return self.add(GetColumn(name, self._input_schema.field_by_name(name).type)) - def lit(self, value: Any, input_type: Optional[pa.DataType] = None) -> MapOperator: + def lit(self, value: Any, input_type: Optional[pa.DataType] = None) -> Operator: return self.add(LitColumn(value, input_type)) - def unary(self, op: str, col: MapOperator) -> MapOperator: + def unary(self, op: str, col: MapOperator) -> Operator: return self.add(UnaryOperator(op, col)) - def binary(self, op: str, col1: MapOperator, col2: MapOperator) -> MapOperator: + def binary(self, op: str, col1: MapOperator, col2: MapOperator) -> Operator: return self.add(BinaryOperator(op, col1, col2)) def output(self, *args: Union[MapOperator, Tuple[MapOperator, str]]) -> 
None: - self.add(MapOutputOperator(*args)) - - def __len__(self) -> int: - return len(self._steps) + self.set_output_schema( + self.add(MapOutputOperator(*args)).output_schema # type: ignore + ) def __uuid__(self) -> str: return to_uuid(str(Schema(self._input_schema)), self._steps) - - def execute(self, context: MapOperationsContext) -> None: - for step in self._steps: - step.execute(context) diff --git a/tests/slide/operators/test_df_operators.py b/tests/slide/operators/test_df_operators.py new file mode 100644 index 0000000..6bd2e36 --- /dev/null +++ b/tests/slide/operators/test_df_operators.py @@ -0,0 +1,34 @@ +import pandas as pd +from slide.operators.df_operators import ExecutionPlan +from slide.operators.graph import Context +from slide.operators.map_operators import MapExecutionPlan +from slide_pandas import PandasUtils +from slide_test.utils import assert_duck_eq +from triad import Schema + + +def test_simple_plan(): + def build(map_plan: MapExecutionPlan) -> None: + a = map_plan.col("a") + b = map_plan.col("b") + c = map_plan.binary("+", a, b) + map_plan.output(a, b, (c, "c")) + + pdf = pd.DataFrame([[0, 1.2], [2, 3.1]], columns=["a", "b"]) + plan = ExecutionPlan() + df = plan.df(pdf, Schema("a:long,b:double").pa_schema) + assert Schema(df.output_schema) == "a:long,b:double" + df = plan.select(df, build) + assert Schema(df.output_schema) == "a:long,b:double,c:double" + plan.output(df) + assert Schema(plan.output_schema) == "a:long,b:double,c:double" + + ctx = Context(PandasUtils()) + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT a, b, a+b AS c FROM a", + a=pdf, + check_order=False, + ) diff --git a/tests/slide/operators/test_select_operators.py b/tests/slide/operators/test_map_operators.py similarity index 98% rename from tests/slide/operators/test_select_operators.py rename to tests/slide/operators/test_map_operators.py index 49ec88b..07ab733 100644 --- a/tests/slide/operators/test_select_operators.py +++ 
b/tests/slide/operators/test_map_operators.py @@ -305,10 +305,11 @@ def test_plan(): col6 = plan.unary("-", col5) # a, b, a+b as x, a+b-2 as y, -(a+b) as z plan.output(col1, col2, (col3, "x"), (col4, "y"), (col6, "z")) + assert Schema(plan.output_schema) == "b:float,a:int,x:double,y:double,z:double" plan.execute(ctx) assert_duck_eq( ctx.output, - "SELECT a, b, a+b AS x, a+b-2 AS y, -(a+b) AS z FROM a", + "SELECT b, a, a+b AS x, a+b-2 AS y, -(a+b) AS z FROM a", a=pdf, check_order=False, ) @@ -325,9 +326,8 @@ def test_plan_uuid(): plan3.col("b") tid = to_uuid(plan2) assert to_uuid(plan3) != to_uuid(plan2) - + plan2.col("a") assert tid == to_uuid(plan2) plan2.col("b") assert tid != to_uuid(plan2) - From bb358f8b6697be9356b213a1adfc7cbcfaef7ec1 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 14 Dec 2021 08:46:30 +0000 Subject: [PATCH 6/7] prepare for union --- slide/_type_utils.py | 179 +++++++++++++++++++++ slide/operators/df_operators.py | 8 +- slide/operators/graph.py | 10 +- tests/slide/operators/test_df_operators.py | 7 +- tests/slide/test_type_utils.py | 29 ++++ 5 files changed, 221 insertions(+), 12 deletions(-) create mode 100644 slide/_type_utils.py create mode 100644 tests/slide/test_type_utils.py diff --git a/slide/_type_utils.py b/slide/_type_utils.py new file mode 100644 index 0000000..17ea978 --- /dev/null +++ b/slide/_type_utils.py @@ -0,0 +1,179 @@ +from typing import Dict, Iterable, Tuple +import pyarrow as pa +from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP + +_ORDERED_PYARROW_TYPES = [ + pa.bool_(), + pa.int8(), + pa.uint8(), + pa.int16(), + pa.uint16(), + pa.int32(), + pa.uint32(), + pa.int64(), + pa.uint64(), + pa.float16(), + pa.float32(), + pa.float64(), + pa.string(), +] + + +def _generate_union_inference_types() -> Iterable[ # noqa: C901 + Tuple[pa.DataType, pa.DataType, pa.DataType] +]: + a = pa.bool_() + for b in _ORDERED_PYARROW_TYPES[1:]: + yield a, b, b + a = pa.int8() + yield a, pa.bool_(), a + yield a, pa.uint8(), pa.int16() + 
yield a, pa.uint16(), pa.int32() + yield a, pa.uint32(), pa.int64() + yield a, pa.uint64(), pa.float64() + for b in [ + pa.int16(), + pa.int32(), + pa.int64(), + pa.float16(), + pa.float32(), + pa.float64(), + pa.string(), + ]: + yield a, b, b + + a = pa.uint8() + yield a, pa.bool_(), a + yield a, pa.int8(), pa.int16() + for b in _ORDERED_PYARROW_TYPES[3:]: + yield a, b, b + + a = pa.int16() + for b in _ORDERED_PYARROW_TYPES[:3]: + yield a, b, a + yield a, pa.uint16(), pa.int32() + yield a, pa.uint32(), pa.int64() + yield a, pa.uint64(), pa.float64() + for b in [ + pa.int32(), + pa.int64(), + pa.float16(), + pa.float32(), + pa.float64(), + pa.string(), + ]: + yield a, b, b + + a = pa.uint16() + yield a, pa.bool_(), a + yield a, pa.int8(), pa.int32() + yield a, pa.uint8(), a + yield a, pa.int16(), pa.int32() + for b in _ORDERED_PYARROW_TYPES[5:]: + yield a, b, b + + a = pa.int32() + for b in _ORDERED_PYARROW_TYPES[:5]: + yield a, b, a + yield a, pa.uint32(), pa.int64() + yield a, pa.uint64(), pa.float64() + for b in [ + pa.int64(), + pa.float16(), + pa.float32(), + pa.float64(), + pa.string(), + ]: + yield a, b, b + + a = pa.uint32() + yield a, pa.bool_(), a + yield a, pa.int8(), pa.int64() + yield a, pa.uint8(), a + yield a, pa.int16(), pa.int64() + yield a, pa.uint16(), a + for b in _ORDERED_PYARROW_TYPES[7:]: + yield a, b, b + + a = pa.int64() + for b in _ORDERED_PYARROW_TYPES[:7]: + yield a, b, a + yield a, pa.uint64(), pa.float64() + for b in [ + pa.float16(), + pa.float32(), + pa.float64(), + pa.string(), + ]: + yield a, b, b + + a = pa.uint64() + yield a, pa.bool_(), a + yield a, pa.int8(), pa.float64() + yield a, pa.uint8(), a + yield a, pa.int16(), pa.float64() + yield a, pa.uint16(), a + yield a, pa.int32(), pa.float64() + yield a, pa.uint32(), a + for b in _ORDERED_PYARROW_TYPES[9:]: + yield a, b, b + + a = pa.float16() + for b in _ORDERED_PYARROW_TYPES[:9]: + yield a, b, a + for b in _ORDERED_PYARROW_TYPES[10:]: + yield a, b, b + + a = pa.float32() + for 
b in _ORDERED_PYARROW_TYPES[:10]: + yield a, b, a + for b in _ORDERED_PYARROW_TYPES[11:]: + yield a, b, b + + a = pa.float64() + for b in _ORDERED_PYARROW_TYPES[:11]: + yield a, b, a + for b in _ORDERED_PYARROW_TYPES[10:]: + yield a, b, b + + a = pa.string() + for b in _ORDERED_PYARROW_TYPES[:12]: + yield a, b, a + + yield pa.date32(), pa.date64(), pa.date64() + yield pa.date64(), pa.date32(), pa.date64() + + +_UNION_INFERENCE_DICT: Dict[Tuple[pa.DataType, pa.DataType], pa.DataType] = { + (x[0], x[1]): x[2] for x in _generate_union_inference_types() +} + + +def infer_union_type( # noqa: C901 + t1: pa.DataType, + t2: pa.DataType, +) -> pa.DataType: + if t1 == t2: + return t1 + if pa.types.is_timestamp(t1): + if pa.types.is_timestamp(t2) or pa.types.is_date(t2): + return TRIAD_DEFAULT_TIMESTAMP + elif pa.types.is_string(t2): + return pa.string() + raise ValueError(f"can't infer unioned schema for {t1} and {t2}") + if pa.types.is_timestamp(t2): + if pa.types.is_timestamp(t1) or pa.types.is_date(t1): + return TRIAD_DEFAULT_TIMESTAMP + elif pa.types.is_string(t1): + return pa.string() + raise ValueError(f"can't infer unioned schema for {t1} and {t2}") + if pa.types.is_nested(t1) or pa.types.is_nested(t2): + raise ValueError(f"can't infer unioned schema for {t1} and {t2}") + if pa.types.is_binary(t1) or pa.types.is_binary(t2): + raise ValueError(f"can't infer unioned schema for {t1} and {t2}") + key = (t1, t2) + if key in _UNION_INFERENCE_DICT: + return _UNION_INFERENCE_DICT[key] + raise ValueError( # pragma: no cover + f"can't infer unioned schema for {t1} and {t2}" + ) diff --git a/slide/operators/df_operators.py b/slide/operators/df_operators.py index 44b127a..fc611ab 100644 --- a/slide/operators/df_operators.py +++ b/slide/operators/df_operators.py @@ -12,9 +12,9 @@ def output_schema(self) -> pa.Schema: class GetDataFrameOperator(DataFrameOperator): - def __init__(self, df: Any, input_schema: pa.Schema): - super().__init__(id(df), str(input_schema)) - self._df = df + 
def __init__(self, name: str, input_schema: pa.Schema): + super().__init__(name, str(input_schema)) + self._name = name self._schema = input_schema @property @@ -22,7 +22,7 @@ def output_schema(self) -> pa.Schema: return self._schema def execute(self, context: Context) -> None: - context[self] = self._df + context[self] = context[self._name] class SelectOperator(DataFrameOperator): diff --git a/slide/operators/graph.py b/slide/operators/graph.py index e4fe9ee..7f008bb 100644 --- a/slide/operators/graph.py +++ b/slide/operators/graph.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import pyarrow as pa from slide.exceptions import SlideInvalidOperation @@ -43,11 +43,11 @@ def output(self) -> Any: def set_output(self, df: Any) -> None: self._output = df - def __setitem__(self, op: Operator, value: Any) -> None: - self._results[op.key] = value + def __setitem__(self, op: Union[str, Operator], value: Any) -> None: + self._results[op if isinstance(op, str) else op.key] = value - def __getitem__(self, op: Operator) -> None: - return self._results[op.key] + def __getitem__(self, op: Union[str, Operator]) -> None: + return self._results[op if isinstance(op, str) else op.key] class Graph: diff --git a/tests/slide/operators/test_df_operators.py b/tests/slide/operators/test_df_operators.py index 6bd2e36..406bc0f 100644 --- a/tests/slide/operators/test_df_operators.py +++ b/tests/slide/operators/test_df_operators.py @@ -7,16 +7,15 @@ from triad import Schema -def test_simple_plan(): +def test_select(): def build(map_plan: MapExecutionPlan) -> None: a = map_plan.col("a") b = map_plan.col("b") c = map_plan.binary("+", a, b) map_plan.output(a, b, (c, "c")) - pdf = pd.DataFrame([[0, 1.2], [2, 3.1]], columns=["a", "b"]) plan = ExecutionPlan() - df = plan.df(pdf, Schema("a:long,b:double").pa_schema) + df = plan.df("a", Schema("a:long,b:double").pa_schema) assert Schema(df.output_schema) == "a:long,b:double" df = 
plan.select(df, build) assert Schema(df.output_schema) == "a:long,b:double,c:double" @@ -24,6 +23,8 @@ def build(map_plan: MapExecutionPlan) -> None: assert Schema(plan.output_schema) == "a:long,b:double,c:double" ctx = Context(PandasUtils()) + pdf = pd.DataFrame([[0, 1.2], [2, 3.1]], columns=["a", "b"]) + ctx["a"] = pdf plan.execute(ctx) assert_duck_eq( diff --git a/tests/slide/test_type_utils.py b/tests/slide/test_type_utils.py new file mode 100644 index 0000000..016d5da --- /dev/null +++ b/tests/slide/test_type_utils.py @@ -0,0 +1,29 @@ +import pyarrow as pa +from pytest import raises +from triad import Schema + +from slide._type_utils import infer_union_type + + +def test_infer_union_type(): + schema = Schema( + "a:int32,b:float32,c:string,d:datetime,e:date,f:[int],g:{a:str},h:binary" + ) + assert pa.int32() == infer_union_type(schema["a"].type, schema["a"].type) + assert pa.float32() == infer_union_type(schema["a"].type, schema["b"].type) + assert pa.string() == infer_union_type(schema["a"].type, schema["c"].type) + assert pa.string() == infer_union_type(schema["c"].type, schema["a"].type) + assert schema["d"].type == infer_union_type(schema["d"].type, schema["d"].type) + assert schema["d"].type == infer_union_type(schema["d"].type, schema["e"].type) + assert schema["d"].type == infer_union_type(schema["e"].type, schema["d"].type) + assert pa.string() == infer_union_type(schema["d"].type, schema["c"].type) + assert pa.string() == infer_union_type(schema["c"].type, schema["d"].type) + assert schema["f"].type == infer_union_type(schema["f"].type, schema["f"].type) + assert schema["g"].type == infer_union_type(schema["g"].type, schema["g"].type) + + raises(ValueError, lambda: infer_union_type(schema["f"].type, schema["g"].type)) + raises(ValueError, lambda: infer_union_type(schema["c"].type, schema["g"].type)) + raises(ValueError, lambda: infer_union_type(schema["d"].type, schema["a"].type)) + raises(ValueError, lambda: infer_union_type(schema["a"].type, 
schema["d"].type)) + raises(ValueError, lambda: infer_union_type(schema["a"].type, schema["h"].type)) + raises(ValueError, lambda: infer_union_type(schema["h"].type, schema["a"].type)) From 2ba386ba989ffef869755ca76813518a50c1d38a Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 16 Dec 2021 08:25:37 +0000 Subject: [PATCH 7/7] refactor operators --- slide/expressions/__init__.py | 0 slide/operators/df_operators.py | 73 --- slide/operators/execution_plan.py | 446 ++++++++++++++++++ slide/operators/graph.py | 9 +- slide/operators/map_operators.py | 233 --------- slide/utils.py | 19 + slide_test/suite.py | 15 + tests/slide/operators/test_df_operators.py | 35 -- .../slide/operators/test_execution_plan_df.py | 196 ++++++++ .../operators/test_execution_plan_map.py | 297 ++++++++++++ tests/slide/operators/test_map_operators.py | 333 ------------- 11 files changed, 980 insertions(+), 676 deletions(-) create mode 100644 slide/expressions/__init__.py delete mode 100644 slide/operators/df_operators.py create mode 100644 slide/operators/execution_plan.py delete mode 100644 slide/operators/map_operators.py delete mode 100644 tests/slide/operators/test_df_operators.py create mode 100644 tests/slide/operators/test_execution_plan_df.py create mode 100644 tests/slide/operators/test_execution_plan_map.py delete mode 100644 tests/slide/operators/test_map_operators.py diff --git a/slide/expressions/__init__.py b/slide/expressions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/slide/operators/df_operators.py b/slide/operators/df_operators.py deleted file mode 100644 index fc611ab..0000000 --- a/slide/operators/df_operators.py +++ /dev/null @@ -1,73 +0,0 @@ -from typing import Any, Callable, Optional - -import pyarrow as pa -from slide.operators.graph import Context, Graph, Operator -from slide.operators.map_operators import MapExecutionPlan, MapOperationsContext - - -class DataFrameOperator(Operator): - @property - def output_schema(self) -> pa.Schema: - raise 
NotImplementedError # pragma: no cover - - -class GetDataFrameOperator(DataFrameOperator): - def __init__(self, name: str, input_schema: pa.Schema): - super().__init__(name, str(input_schema)) - self._name = name - self._schema = input_schema - - @property - def output_schema(self) -> pa.Schema: - return self._schema - - def execute(self, context: Context) -> None: - context[self] = context[self._name] - - -class SelectOperator(DataFrameOperator): - def __init__( - self, df: DataFrameOperator, builder: Callable[[MapExecutionPlan], None] - ): - self._plan = MapExecutionPlan(df.output_schema) - builder(self._plan) - self._output_schema = self._plan.output_schema - self._df = df - super().__init__(df, self._plan) - - @property - def output_schema(self) -> pa.Schema: - return self._output_schema - - def execute(self, context: Context) -> None: - indf = context[self._df] - ctx = MapOperationsContext(context.utils, indf) - self._plan.execute(ctx) - context[self] = ctx.output - - -class OutputDataFrameOperator(DataFrameOperator): - def __init__(self, df: DataFrameOperator): - super().__init__(df) - self._df = df - - def execute(self, context: Context) -> None: - context.set_output(context[self._df]) - - -class ExecutionPlan(Graph): - def __init__(self): - super().__init__() - self._output_schema: Optional[pa.Schema] = None - - def df(self, df: Any, input_schema: pa.Schema) -> Operator: - return self.add(GetDataFrameOperator(df, input_schema)) - - def select( - self, df: DataFrameOperator, builder: Callable[[MapExecutionPlan], None] - ) -> Operator: - return self.add(SelectOperator(df, builder)) - - def output(self, df: DataFrameOperator) -> None: - self.add(OutputDataFrameOperator(df)) - self.set_output_schema(df.output_schema) diff --git a/slide/operators/execution_plan.py b/slide/operators/execution_plan.py new file mode 100644 index 0000000..dbddeda --- /dev/null +++ b/slide/operators/execution_plan.py @@ -0,0 +1,446 @@ +from builtins import isinstance +from typing 
import Any, Dict, List, Optional, Tuple, Union + +import pandas as pd +import pyarrow as pa +from slide._type_utils import infer_union_type +from slide.operators.graph import Context, Graph, Node, Operator + + +class DataFrameOperator(Operator): + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(*args, **kwargs) + self._nodes: Dict[str, Node] = {} + + @property + def output_schema(self) -> pa.Schema: + raise NotImplementedError # pragma: no cover + + @property + def nodes(self) -> Dict[str, Node]: + return self._nodes + + +class MapOperator(Operator): + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(*args, **kwargs) + + nodes = {x.node for x in args if isinstance(x, MapOperator)} + nodes = nodes.union( + x.node for x in kwargs.values() if isinstance(x, MapOperator) + ) + self._node = Node(nodes) + + @property + def output_type(self) -> pa.DataType: + raise NotImplementedError # pragma: no cover + + @property + def output_name(self) -> Optional[str]: + raise NotImplementedError # pragma: no cover + + @property + def node(self) -> Node: + return self._node + + +class GetDataFrameOperator(DataFrameOperator): + def __init__(self, name: str, input_schema: pa.Schema): + super().__init__(name, str(input_schema)) + self._name = name + self._schema = input_schema + + for f in input_schema: + self.nodes[f.name] = Node(set()) + + @property + def output_schema(self) -> pa.Schema: + return self._schema + + def execute(self, context: Context) -> None: + context[self] = context[self._name] + + +class UnionOperator(DataFrameOperator): + def __init__( + self, df1: DataFrameOperator, df2: DataFrameOperator, distinct: bool = True + ): + super().__init__(df1, df2, distinct) + fields1: List[pa.Field] = [] + fields2: List[pa.Field] = [] + for f1, f2 in zip(df1.output_schema, df2.output_schema): + inf_type = infer_union_type(f1.type, f2.type) + fields1.append(pa.field(f1.name, inf_type)) + fields2.append(pa.field(f2.name, inf_type)) + 
self.nodes[f1.name] = Node({df1.nodes[f1.name], df2.nodes[f2.name]}) + self._schema1 = pa.schema(fields1) + self._schema2 = pa.schema(fields2) + self._output_schema = self._schema1 + self._df1 = df1 + self._df2 = df2 + self._distinct = distinct + + @property + def output_schema(self) -> pa.Schema: + return self._output_schema + + def execute(self, context: Context) -> None: + df1 = context[self._df1] + df2 = context[self._df2] + df1 = context.utils.cast_df(df1, self._schema1, self._df1.output_schema) + df2 = context.utils.cast_df(df2, self._schema2, self._df2.output_schema) + context[self] = context.utils.union(df1, df2, unique=self._distinct) + + +class ExceptOperator(DataFrameOperator): + def __init__( + self, df1: DataFrameOperator, df2: DataFrameOperator, distinct: bool = True + ): + super().__init__(df1, df2, distinct) + self._df1 = df1 + self._df2 = df2 + self._distinct = distinct + for f1, f2 in zip(df1.output_schema, df2.output_schema): + self.nodes[f1.name] = Node({df1.nodes[f1.name], df2.nodes[f2.name]}) + + @property + def output_schema(self) -> pa.Schema: + return self._df1.output_schema + + def execute(self, context: Context) -> None: + df1 = context[self._df1] + df2 = context[self._df2] + context[self] = context.utils.except_df(df1, df2, unique=self._distinct) + + +class IntersectOperator(DataFrameOperator): + def __init__( + self, df1: DataFrameOperator, df2: DataFrameOperator, distinct: bool = True + ): + super().__init__(df1, df2, distinct) + self._df1 = df1 + self._df2 = df2 + self._distinct = distinct + for f1, f2 in zip(df1.output_schema, df2.output_schema): + self.nodes[f1.name] = Node({df1.nodes[f1.name], df2.nodes[f2.name]}) + + @property + def output_schema(self) -> pa.Schema: + return self._df1.output_schema + + def execute(self, context: Context) -> None: + df1 = context[self._df1] + df2 = context[self._df2] + context[self] = context.utils.intersect(df1, df2, unique=self._distinct) + + +class FilterOperator(DataFrameOperator): + def 
__init__(self, df: DataFrameOperator, filter_col: str, drop: bool = True): + super().__init__(df, filter_col, drop) + self._df = df + self._filter_col = filter_col + self._drop = drop + if not drop: + self._output_schema = df.output_schema + else: + self._output_schema = [x for x in df.output_schema if x.name != filter_col] + for f in self._output_schema: + self.nodes[f.name] = Node({df.nodes[f.name], df.nodes[filter_col]}) + + @property + def output_schema(self) -> pa.Schema: + return self._output_schema + + def execute(self, context: Context) -> None: + df = context[self._df] + res = context.utils.filter_df(df, df[self._filter_col]) + if self._drop: + res = context.utils.drop_columns(df, [self._filter_col]) + context[self] = res + + +class JoinOperator(DataFrameOperator): + def __init__(self, df1: DataFrameOperator, df2: DataFrameOperator, how: str): + super().__init__(df1, df2, how) + self._df1 = df1 + self._df2 = df2 + self._how = how + self._on = list( + set(f.name for f in df1.output_schema).intersection( # noqa: C401 + f.name for f in df2.output_schema + ) + ) + f1 = [f for f in df1.output_schema if f.name not in self._on] + f2 = [f for f in df2.output_schema if f.name not in self._on] + self._output_schema = pa.schema(f1 + f2) + + on_nodes = [df1.nodes[n] for n in self._on] + [df2.nodes[n] for n in self._on] + for f in f1: + self.nodes[f.name] = Node({df1.nodes[f.name], *on_nodes}) + for f in f2: + self.nodes[f.name] = Node({df2.nodes[f.name], *on_nodes}) + + @property + def output_schema(self) -> pa.Schema: + return self._output_schema + + def execute(self, context: Context) -> None: + df1 = context[self._df1] + df2 = context[self._df2] + res = context.utils.join(df1, df2, join_type=self._how, on=self._on) + context[self] = context.utils.select_columns( + res, [f.name for f in self.output_schema] + ) + + +class OutputDataFrameOperator(DataFrameOperator): + def __init__(self, df: DataFrameOperator): + super().__init__(df) + self._df = df + + for k, v in 
df.nodes.items(): + self.nodes[k] = v + + def execute(self, context: Context) -> None: + context.set_output(context[self._df]) + + +class GetColumn(MapOperator): + def __init__(self, df: DataFrameOperator, name: str): + super().__init__(df, name) + self._name = name + self._df = df + self._node = df.nodes[name] + + @property + def output_type(self) -> pa.DataType: + return self._df.output_schema.field_by_name(self._name).type + + @property + def output_name(self) -> Optional[str]: + return self._name + + def execute(self, context: Context) -> None: # type: ignore + context[self] = context[self._df][self._name] + + +class LitColumn(MapOperator): + def __init__(self, value: Any, input_type: Optional[pa.DataType] = None): + super().__init__(value, str(input_type)) + self._value = value + self._output_type = pa.scalar(value, input_type).type + + @property + def output_type(self) -> pa.DataType: + return self._output_type + + def execute(self, context: Context) -> None: + context[self] = self._value + + +class UnaryOperator(MapOperator): + def __init__(self, op: str, col: MapOperator): + super().__init__(op, col) + self._op = op + self._col = col + self._output_type = self._get_output_type(op, col.output_type) + + @property + def output_type(self) -> pa.DataType: + return self._output_type + + @property + def output_name(self) -> Optional[str]: + return self._col.output_name + + def execute(self, context: Context) -> None: + if self._op in ["+", "-"]: + context[self] = context.utils.unary_arithmetic_op( + context[self._col], op=self._op + ) + elif self._op == "~": + context[self] = context.utils.logical_not(context[self._col]) + else: + raise NotImplementedError(self._op) # pragma: no cover + + def _get_output_type(self, op: str, input_type: pa.DataType) -> pa.DataType: + if op == "+": + if pa.types.is_integer(input_type) or pa.types.is_floating(input_type): + return input_type + elif op == "-": + if pa.types.is_integer(input_type): + return pa.int64() + if 
pa.types.is_floating(input_type): + return input_type + elif op == "~": + if pa.types.is_boolean(input_type): + return input_type + raise ValueError(f"'{op}' can't be applied to {input_type}") + + +class BinaryOperator(MapOperator): + def __init__(self, op: str, col1: MapOperator, col2: MapOperator): + super().__init__(op, col1, col2) + self._op = op + self._col1 = col1 + self._col2 = col2 + self._output_type = self._get_output_type( + op, col1.output_type, col2.output_type + ) + + @property + def output_type(self) -> pa.DataType: + return self._output_type + + def execute(self, context: Context) -> None: + if self._op in ["+", "-", "*", "/"]: + res = context.utils.binary_arithmetic_op( + context[self._col1], context[self._col2], op=self._op + ) + if ( # int/int -> int + pa.types.is_integer(self._col1.output_type) + and pa.types.is_integer(self._col2.output_type) + and not pd.api.types.is_integer_dtype(res.dtype) + ): + res = context.utils.cast(res, "int64") + context[self] = res + elif self._op in ["&", "|"]: + context[self] = context.utils.binary_logical_op( + context[self._col1], + context[self._col2], + op="and" if self._op == "&" else "or", + ) + else: + raise NotImplementedError(self._op) # pragma: no cover + + def _get_output_type( # noqa: C901 + self, op: str, t1: pa.DataType, t2: pa.DataType + ) -> pa.DataType: + if op == "+": + if pa.types.is_integer(t1): + if pa.types.is_integer(t2): + return pa.int64() + if pa.types.is_floating(t2): + return pa.float64() + elif pa.types.is_floating(t1): + if pa.types.is_integer(t2) or pa.types.is_floating(t2): + return pa.float64() + # TODO: time + interval + if op == "-": + if pa.types.is_integer(t1): + if pa.types.is_integer(t2): + return pa.int64() + if pa.types.is_floating(t2): + return pa.float64() + elif pa.types.is_floating(t1): + if pa.types.is_integer(t2) or pa.types.is_floating(t2): + return pa.float64() + # TODO: time - interval + # TODO: time - time + elif op in ["*", "/"]: + if pa.types.is_integer(t1): + if 
pa.types.is_integer(t2): + return pa.int64() + if pa.types.is_floating(t2): + return pa.float64() + elif pa.types.is_floating(t1): + if pa.types.is_integer(t2) or pa.types.is_floating(t2): + return pa.float64() + elif op in ["&", "|"]: + if (pa.types.is_boolean(t1) or pa.types.is_null(t1)) and ( + pa.types.is_boolean(t2) or pa.types.is_null(t2) + ): + return pa.bool_() + raise ValueError( # pragma: no cover + f"'{op}' can't be applied to {t1} and {t2}" + ) + + +class ColsToDataFrameOperator(DataFrameOperator): + def __init__( + self, + *args: Union[MapOperator, Tuple[MapOperator, str]], + reference: DataFrameOperator, + ): + self._data: List[Any] = [ + (x, x.output_name) if isinstance(x, MapOperator) else x for x in args + ] + super().__init__(*self._data, reference) + self._output_schema = pa.schema( + [pa.field(x[1], x[0].output_type) for x in self._data] + ) + + self._ref = reference + self._nodes = {v: k.node for k, v in self._data} + + @property + def output_schema(self) -> pa.Schema: + return self._output_schema + + @property + def nodes(self) -> Dict[str, Node]: + return self._nodes + + def execute(self, context: Context) -> None: + cols = [context[x] for x, _ in self._data] + names = [y for _, y in self._data] + context[self] = context.utils.cols_to_df( + cols, names=names, reference=context[self._ref] + ) + + +class ExecutionPlan(Graph): + def __init__(self): + super().__init__() + self._output_schema: Optional[pa.Schema] = None + + def df(self, df: Any, input_schema: pa.Schema) -> Operator: + return self.add(GetDataFrameOperator(df, input_schema)) + + def union( + self, df1: DataFrameOperator, df2: DataFrameOperator, distinct: bool = True + ) -> Operator: + return self.add(UnionOperator(df1, df2, distinct=distinct)) + + def except_df( + self, df1: DataFrameOperator, df2: DataFrameOperator, distinct: bool = True + ) -> Operator: + return self.add(ExceptOperator(df1, df2, distinct=distinct)) + + def intersect( + self, df1: DataFrameOperator, df2: 
DataFrameOperator, distinct: bool = True + ) -> Operator: + return self.add(IntersectOperator(df1, df2, distinct=distinct)) + + def filter_df( + self, df: DataFrameOperator, filter_col: str, drop: bool = True + ) -> Operator: + return self.add(FilterOperator(df, filter_col, drop)) + + def join( + self, df1: DataFrameOperator, df2: DataFrameOperator, how: str + ) -> Operator: + return self.add(JoinOperator(df1, df2, how)) + + def output(self, df: DataFrameOperator) -> None: + self.add(OutputDataFrameOperator(df)) + self.set_output_schema(df.output_schema) + + def col(self, df: DataFrameOperator, name: str) -> Operator: + return self.add(GetColumn(df, name)) + + def lit(self, value: Any, input_type: Optional[pa.DataType] = None) -> Operator: + return self.add(LitColumn(value, input_type)) + + def unary(self, op: str, col: MapOperator) -> Operator: + return self.add(UnaryOperator(op, col)) + + def binary(self, op: str, col1: MapOperator, col2: MapOperator) -> Operator: + return self.add(BinaryOperator(op, col1, col2)) + + def cols_to_df( + self, + *args: Union[MapOperator, Tuple[MapOperator, str]], + reference: DataFrameOperator, + ) -> Operator: + return self.add(ColsToDataFrameOperator(*args, reference=reference)) diff --git a/slide/operators/graph.py b/slide/operators/graph.py index 7f008bb..fcb588d 100644 --- a/slide/operators/graph.py +++ b/slide/operators/graph.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Set import pyarrow as pa from slide.exceptions import SlideInvalidOperation @@ -6,6 +6,11 @@ from triad import assert_or_throw, to_uuid +class Node: + def __init__(self, parents: Set["Node"]): + self._parents = parents + + class Operator: def __init__(self, *args: Any, **kwargs: Any): self._uuid = to_uuid(self.identifier, args, kwargs) @@ -46,7 +51,7 @@ def set_output(self, df: Any) -> None: def __setitem__(self, op: Union[str, Operator], value: Any) -> None: self._results[op if 
isinstance(op, str) else op.key] = value - def __getitem__(self, op: Union[str, Operator]) -> None: + def __getitem__(self, op: Union[str, Operator]) -> Any: return self._results[op if isinstance(op, str) else op.key] diff --git a/slide/operators/map_operators.py b/slide/operators/map_operators.py deleted file mode 100644 index e42b1b1..0000000 --- a/slide/operators/map_operators.py +++ /dev/null @@ -1,233 +0,0 @@ -from typing import Any, List, Optional, Tuple, Union - -import pandas as pd -import pyarrow as pa - -from slide.operators.graph import Context, Graph, Operator -from slide.utils import SlideUtils -from triad import Schema, to_uuid - - -class MapOperator(Operator): - @property - def output_type(self) -> pa.DataType: - raise NotImplementedError # pragma: no cover - - @property - def output_name(self) -> Optional[str]: - raise NotImplementedError # pragma: no cover - - def execute(self, context: Context) -> None: - assert isinstance(context, MapOperationsContext) - self.execute_map(context) - - def execute_map(self, context: "MapOperationsContext") -> None: - raise NotImplementedError # pragma: no cover - - -class GetColumn(MapOperator): - def __init__(self, name: str, input_type: pa.DataType): - super().__init__(name, str(input_type)) - self._name = name - self._output_type = input_type - - @property - def output_type(self) -> pa.DataType: - return self._output_type - - @property - def output_name(self) -> Optional[str]: - return self._name - - def execute_map(self, context: "MapOperationsContext") -> None: # type: ignore - context[self] = context.df[self._name] - - -class LitColumn(MapOperator): - def __init__(self, value: Any, input_type: Optional[pa.DataType] = None): - super().__init__(value, str(input_type)) - self._value = value - self._output_type = pa.scalar(value, input_type).type - - @property - def output_type(self) -> pa.DataType: - return self._output_type - - def execute_map(self, context: "MapOperationsContext") -> None: - context[self] = 
self._value - - -class UnaryOperator(MapOperator): - def __init__(self, op: str, col: MapOperator): - super().__init__(op, col) - self._op = op - self._col = col - self._output_type = self._get_output_type(op, col.output_type) - - @property - def output_type(self) -> pa.DataType: - return self._output_type - - @property - def output_name(self) -> Optional[str]: - return self._col.output_name - - def execute_map(self, context: "MapOperationsContext") -> None: - if self._op in ["+", "-"]: - context[self] = context.utils.unary_arithmetic_op( - context[self._col], op=self._op - ) - elif self._op == "~": - context[self] = context.utils.logical_not(context[self._col]) - else: - raise NotImplementedError(self._op) # pragma: no cover - - def _get_output_type(self, op: str, input_type: pa.DataType) -> pa.DataType: - if op == "+": - if pa.types.is_integer(input_type) or pa.types.is_floating(input_type): - return input_type - elif op == "-": - if pa.types.is_integer(input_type): - return pa.int64() - if pa.types.is_floating(input_type): - return input_type - elif op == "~": - if pa.types.is_boolean(input_type): - return input_type - raise ValueError(f"'{op}' can't be applied to {input_type}") - - -class BinaryOperator(MapOperator): - def __init__(self, op: str, col1: MapOperator, col2: MapOperator): - super().__init__(op, col1, col2) - self._op = op - self._col1 = col1 - self._col2 = col2 - self._output_type = self._get_output_type( - op, col1.output_type, col2.output_type - ) - - @property - def output_type(self) -> pa.DataType: - return self._output_type - - def execute_map(self, context: "MapOperationsContext") -> None: - if self._op in ["+", "-", "*", "/"]: - res = context.utils.binary_arithmetic_op( - context[self._col1], context[self._col2], op=self._op - ) - if ( # int/int -> int - pa.types.is_integer(self._col1.output_type) - and pa.types.is_integer(self._col2.output_type) - and not pd.api.types.is_integer_dtype(res.dtype) - ): - res = context.utils.cast(res, "int64") 
- context[self] = res - elif self._op in ["&", "|"]: - context[self] = context.utils.binary_logical_op( - context[self._col1], - context[self._col2], - op="and" if self._op == "&" else "or", - ) - else: - raise NotImplementedError(self._op) # pragma: no cover - - def _get_output_type( # noqa: C901 - self, op: str, t1: pa.DataType, t2: pa.DataType - ) -> pa.DataType: - if op == "+": - if pa.types.is_integer(t1): - if pa.types.is_integer(t2): - return pa.int64() - if pa.types.is_floating(t2): - return pa.float64() - elif pa.types.is_floating(t1): - if pa.types.is_integer(t2) or pa.types.is_floating(t2): - return pa.float64() - # TODO: time + interval - if op == "-": - if pa.types.is_integer(t1): - if pa.types.is_integer(t2): - return pa.int64() - if pa.types.is_floating(t2): - return pa.float64() - elif pa.types.is_floating(t1): - if pa.types.is_integer(t2) or pa.types.is_floating(t2): - return pa.float64() - # TODO: time - interval - # TODO: time - time - elif op in ["*", "/"]: - if pa.types.is_integer(t1): - if pa.types.is_integer(t2): - return pa.int64() - if pa.types.is_floating(t2): - return pa.float64() - elif pa.types.is_floating(t1): - if pa.types.is_integer(t2) or pa.types.is_floating(t2): - return pa.float64() - elif op in ["&", "|"]: - if (pa.types.is_boolean(t1) or pa.types.is_null(t1)) and ( - pa.types.is_boolean(t2) or pa.types.is_null(t2) - ): - return pa.bool_() - raise ValueError( # pragma: no cover - f"'{op}' can't be applied to {t1} and {t2}" - ) - - -class MapOutputOperator(MapOperator): - def __init__(self, *args: Union[MapOperator, Tuple[MapOperator, str]]): - self._data: List[Any] = [ - (x, x.output_name) if isinstance(x, MapOperator) else x for x in args - ] - super().__init__(*self._data) - self._output_schema = pa.schema( - [pa.field(x[1], x[0].output_type) for x in self._data] - ) - - @property - def output_schema(self) -> pa.Schema: - return self._output_schema - - def execute_map(self, context: "MapOperationsContext") -> None: - cols = 
[context[x] for x, _ in self._data] - names = [y for _, y in self._data] - context.set_output( - context.utils.cols_to_df(cols, names=names, reference=context.df) - ) - - -class MapOperationsContext(Context): - def __init__(self, utils: SlideUtils, df: Any): - super().__init__(utils) - self._df = df - - @property - def df(self) -> Any: - return self._df - - -class MapExecutionPlan(Graph): - def __init__(self, input_schema: pa.Schema): - super().__init__() - self._input_schema = input_schema - - def col(self, name: str) -> Operator: - return self.add(GetColumn(name, self._input_schema.field_by_name(name).type)) - - def lit(self, value: Any, input_type: Optional[pa.DataType] = None) -> Operator: - return self.add(LitColumn(value, input_type)) - - def unary(self, op: str, col: MapOperator) -> Operator: - return self.add(UnaryOperator(op, col)) - - def binary(self, op: str, col1: MapOperator, col2: MapOperator) -> Operator: - return self.add(BinaryOperator(op, col1, col2)) - - def output(self, *args: Union[MapOperator, Tuple[MapOperator, str]]) -> None: - self.set_output_schema( - self.add(MapOutputOperator(*args)).output_schema # type: ignore - ) - - def __uuid__(self) -> str: - return to_uuid(str(Schema(self._input_schema)), self._steps) diff --git a/slide/utils.py b/slide/utils.py index ea319b2..3b1f938 100644 --- a/slide/utils.py +++ b/slide/utils.py @@ -734,6 +734,25 @@ def get_fields() -> Iterable[pa.Field]: return pa.schema(list(get_fields())) + def drop_columns(self, df: TDf, columns: List[str]) -> TDf: + """Drop columns from the dataframe + + :param df: the dataframe + :param columns: columns to be dropped + :return: the new dataframe without those columns + """ + cols = [c for c in df.columns if c not in columns] + return df[cols] + + def select_columns(self, df: TDf, columns: List[str]) -> TDf: + """Select columns from the dataframe + + :param df: the dataframe + :param columns: columns to be chosen + :return: the new dataframe with those columns + """ + 
return df[columns] + def cast_df( # noqa: C901 self, df: TDf, schema: pa.Schema, input_schema: Optional[pa.Schema] = None ) -> TDf: diff --git a/slide_test/suite.py b/slide_test/suite.py index d6cb35b..c287f53 100644 --- a/slide_test/suite.py +++ b/slide_test/suite.py @@ -2718,3 +2718,18 @@ def test_join_multi_sql(self): b=b, c=c, ) + + def test_drop_select_columns(self): + pdf = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "c"]) + df = self.to_df(pdf) + assert_pdf_eq( + self.to_pd(self.utils.select_columns(df, ["a", "c"])), + pd.DataFrame([[0, 2], [3, 5]], columns=["a", "c"]), + check_order=False, + ) + + assert_pdf_eq( + self.to_pd(self.utils.drop_columns(df, ["a", "c"])), + pd.DataFrame([[1], [4]], columns=["b"]), + check_order=False, + ) diff --git a/tests/slide/operators/test_df_operators.py b/tests/slide/operators/test_df_operators.py deleted file mode 100644 index 406bc0f..0000000 --- a/tests/slide/operators/test_df_operators.py +++ /dev/null @@ -1,35 +0,0 @@ -import pandas as pd -from slide.operators.df_operators import ExecutionPlan -from slide.operators.graph import Context -from slide.operators.map_operators import MapExecutionPlan -from slide_pandas import PandasUtils -from slide_test.utils import assert_duck_eq -from triad import Schema - - -def test_select(): - def build(map_plan: MapExecutionPlan) -> None: - a = map_plan.col("a") - b = map_plan.col("b") - c = map_plan.binary("+", a, b) - map_plan.output(a, b, (c, "c")) - - plan = ExecutionPlan() - df = plan.df("a", Schema("a:long,b:double").pa_schema) - assert Schema(df.output_schema) == "a:long,b:double" - df = plan.select(df, build) - assert Schema(df.output_schema) == "a:long,b:double,c:double" - plan.output(df) - assert Schema(plan.output_schema) == "a:long,b:double,c:double" - - ctx = Context(PandasUtils()) - pdf = pd.DataFrame([[0, 1.2], [2, 3.1]], columns=["a", "b"]) - ctx["a"] = pdf - plan.execute(ctx) - - assert_duck_eq( - ctx.output, - "SELECT a, b, a+b AS c FROM a", - a=pdf, - 
check_order=False, - ) diff --git a/tests/slide/operators/test_execution_plan_df.py b/tests/slide/operators/test_execution_plan_df.py new file mode 100644 index 0000000..147bda4 --- /dev/null +++ b/tests/slide/operators/test_execution_plan_df.py @@ -0,0 +1,196 @@ +import pandas as pd +from slide.operators.execution_plan import ExecutionPlan +from slide.operators.graph import Context +from slide_pandas import PandasUtils +from slide_test.utils import assert_duck_eq +from triad import Schema + + +def test_select(): + plan = ExecutionPlan() + df = plan.df("a", Schema("a:long,b:double").pa_schema) + assert Schema(df.output_schema) == "a:long,b:double" + a = plan.col(df, "a") + ln = len(plan) + plan.col(df, "a") + assert ln == len(plan) # dedup + b = plan.col(df, "b") + c = plan.binary("+", a, b) + df = plan.cols_to_df(a, b, (c, "c"), reference=df) + assert Schema(df.output_schema) == "a:long,b:double,c:double" + plan.output(df) + assert Schema(plan.output_schema) == "a:long,b:double,c:double" + + ctx = Context(PandasUtils()) + pdf = pd.DataFrame([[0, 1.2], [2, 3.1]], columns=["a", "b"]) + ctx["a"] = pdf + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT a, b, a+b AS c FROM a", + a=pdf, + check_order=False, + ) + + +def test_union(): + plan = ExecutionPlan() + df1 = plan.df("a", Schema("a:uint32,b:int,c:bool").pa_schema) + df2 = plan.df("b", Schema("aa:int8,bb:double,cc:str").pa_schema) + df = plan.union(df1, df2) + assert Schema(df.output_schema) == "a:long,b:double,c:str" + plan.output(df) + assert Schema(plan.output_schema) == "a:long,b:double,c:str" + + ctx = Context(PandasUtils()) + pdf1 = pd.DataFrame([[0, None, True], [2, 3, False]], columns=["a", "b", "c"]) + ctx["a"] = pdf1 + pdf2 = pd.DataFrame([[-1, 1.1, "x"], [-2, 3.1, None]], columns=["aa", "bb", "cc"]) + ctx["b"] = pdf2 + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT a, b, c FROM a UNION SELECT aa,bb,cc FROM b", + a=pdf1, + b=pdf2, + check_order=False, + ) + + +def 
test_union_all(): + plan = ExecutionPlan() + df1 = plan.df("a", Schema("a:uint32,b:int,c:bool").pa_schema) + df = plan.union(df1, df1, distinct=False) + assert Schema(df.output_schema) == Schema("a:uint32,b:int,c:bool") + plan.output(df) + assert Schema(plan.output_schema) == Schema("a:uint32,b:int,c:bool") + + ctx = Context(PandasUtils()) + pdf1 = pd.DataFrame([[0, None, True], [2, 3, False]], columns=["a", "b", "c"]) + ctx["a"] = pdf1 + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT a, b, c FROM a UNION ALL SELECT a, b, c FROM a", + a=pdf1, + check_order=False, + ) + + +def test_except(): + + plan = ExecutionPlan() + df1 = plan.df("a", Schema("a:uint32,b:int").pa_schema) + df2 = plan.df("b", Schema("a:long,b:long").pa_schema) + df = plan.except_df(df1, df2) + assert Schema(df.output_schema) == Schema("a:uint32,b:int") + plan.output(df) + assert Schema(plan.output_schema) == Schema("a:uint32,b:int") + + ctx = Context(PandasUtils()) + pdf1 = pd.DataFrame([[1, 2], [2, 3], [1, 2], [2, 3]], columns=["a", "b"]) + ctx["a"] = pdf1 + pdf2 = pd.DataFrame([[2, 3], [4, 5]], columns=["a", "b"]) + ctx["b"] = pdf2 + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT a, b FROM a EXCEPT SELECT a, b FROM b", + a=pdf1, + b=pdf2, + check_order=False, + ) + + +def test_intersect(): + plan = ExecutionPlan() + df1 = plan.df("a", Schema("a:uint32,b:int").pa_schema) + df2 = plan.df("b", Schema("a:long,b:long").pa_schema) + df = plan.intersect(df1, df2) + assert Schema(df.output_schema) == Schema("a:uint32,b:int") + plan.output(df) + assert Schema(plan.output_schema) == Schema("a:uint32,b:int") + + ctx = Context(PandasUtils()) + pdf1 = pd.DataFrame([[1, 2], [2, 3], [1, 2], [2, 3]], columns=["a", "b"]) + ctx["a"] = pdf1 + pdf2 = pd.DataFrame([[2, 3], [4, 5]], columns=["a", "b"]) + ctx["b"] = pdf2 + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT a, b FROM a INTERSECT SELECT a, b FROM b", + a=pdf1, + b=pdf2, + check_order=False, + ) + + +def 
test_filter(): + plan = ExecutionPlan() + df = plan.df("a", Schema("a:long,b:bool,c:bool").pa_schema) + df = plan.filter_df(df, "c", drop=False) + assert Schema(df.output_schema) == Schema("a:long,b:bool,c:bool") + plan.output(df) + assert Schema(plan.output_schema) == Schema("a:long,b:bool,c:bool") + + ctx = Context(PandasUtils()) + pdf1 = pd.DataFrame([[1, True, False], [2, True, True]], columns=["a", "b", "c"]) + ctx["a"] = pdf1 + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT a,b,c FROM a WHERE c", + a=pdf1, + check_order=False, + ) + + plan = ExecutionPlan() + df = plan.df("a", Schema("a:long,b:bool,c:bool").pa_schema) + df = plan.filter_df(df, "b") + assert Schema(df.output_schema) == Schema("a:long,c:bool") + plan.output(df) + assert Schema(plan.output_schema) == Schema("a:long,c:bool") + + ctx = Context(PandasUtils()) + pdf1 = pd.DataFrame([[1, True, False], [2, True, True]], columns=["a", "b", "c"]) + ctx["a"] = pdf1 + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT a,c FROM a WHERE b", + a=pdf1, + check_order=False, + ) + + +def test_join(): + plan = ExecutionPlan() + df1 = plan.df("a", Schema("a:uint32,b:int").pa_schema) + df2 = plan.df("b", Schema("a:long,c:long").pa_schema) + df = plan.join(df1, df2, "inner") + assert Schema(df.output_schema) == Schema("b:int,c:long") + plan.output(df) + assert Schema(plan.output_schema) == Schema("b:int,c:long") + + ctx = Context(PandasUtils()) + pdf1 = pd.DataFrame([[1, 2], [2, 3], [1, 2], [2, 3]], columns=["a", "b"]) + ctx["a"] = pdf1 + pdf2 = pd.DataFrame([[2, 3], [-4, 5]], columns=["a", "c"]) + ctx["b"] = pdf2 + plan.execute(ctx) + + assert_duck_eq( + ctx.output, + "SELECT a.b, b.c FROM a INNER JOIN b ON a.a = b.a", + a=pdf1, + b=pdf2, + check_order=False, + ) diff --git a/tests/slide/operators/test_execution_plan_map.py b/tests/slide/operators/test_execution_plan_map.py new file mode 100644 index 0000000..a24ec29 --- /dev/null +++ b/tests/slide/operators/test_execution_plan_map.py @@ 
import pandas as pd
import pyarrow as pa
from pytest import raises
from slide.operators.execution_plan import ExecutionPlan
from slide.operators.graph import Context, Operator
from slide_pandas import PandasUtils
from slide_test.utils import assert_duck_eq
from triad import Schema


def test_col_op():
    """Column-reference operators keep the schema's declared type and name."""

    def build(plan: ExecutionPlan, df: Operator):
        expected = [("c", pa.bool_()), ("a", pa.uint32()), ("b", pa.float32())]
        cols = []
        for name, dtype in expected:
            col = plan.col(df, name)
            assert dtype == col.output_type
            assert name == col.output_name
            cols.append(col)
        return tuple(cols)

    pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"])
    assert_duck_eq(
        run_plan(pdf, "a:uint,b:float32,c:bool", build),
        "SELECT c, a, b FROM a",
        a=pdf,
        check_order=False,
    )


def test_lit_op():
    """Literal operators infer their type (or take an explicit one) and can be aliased."""

    def build(plan: ExecutionPlan, df: Operator):
        null_col = plan.lit(None)
        assert pa.null() == null_col.output_type
        str_col = plan.lit("abc")
        assert pa.string() == str_col.output_type
        int_col = plan.lit(1, pa.uint8())
        assert pa.uint8() == int_col.output_type
        return (str_col, "x"), (int_col, "y"), plan.col(df, "a")

    pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"])
    assert_duck_eq(
        run_plan(pdf, "a:uint,b:float32,c:bool", build),
        "SELECT 'abc' AS x, 1 AS y, a FROM a",
        a=pdf,
        check_order=False,
    )


def test_pure_lit_op():
    """An output made only of literals still yields one row per input row."""

    def build(plan: ExecutionPlan, df: Operator):
        assert pa.null() == plan.lit(None).output_type
        str_col = plan.lit("abc")
        assert pa.string() == str_col.output_type
        int_col = plan.lit(1, pa.uint8())
        assert pa.uint8() == int_col.output_type
        bin_col = plan.lit(b"\0abc")
        assert pa.binary() == bin_col.output_type
        arr_col = plan.lit([1, 2])
        assert pa.types.is_nested(arr_col.output_type)
        return (str_col, "a"), (int_col, "b"), (bin_col, "c"), (arr_col, "d")

    pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"])
    res = run_plan(pdf, "a:uint,b:float32,c:bool", build)
    # binary/nested values can't be compared through SQL, so compare raw values
    expected = [["abc", 1, b"\0abc", [1, 2]], ["abc", 1, b"\0abc", [1, 2]]]
    assert expected == res.astype(object).values.tolist()


def test_unary_op():
    """Unary +/- on numeric columns and ~ on booleans; invalid pairings raise."""

    def build(plan: ExecutionPlan, df: Operator):
        bool_col = plan.col(df, "c")
        uint_col = plan.col(df, "a")
        float_col = plan.col(df, "b")

        pos_a = plan.unary("+", uint_col)
        assert pa.uint32() == pos_a.output_type
        assert "a" == pos_a.output_name
        neg_a = plan.unary("-", uint_col)
        # negation must widen the unsigned input to a signed type
        assert pa.int64() == neg_a.output_type
        pos_b = plan.unary("+", float_col)
        assert pa.float32() == pos_b.output_type
        neg_b = plan.unary("-", float_col)
        assert pa.float32() == neg_b.output_type

        # arithmetic signs reject booleans; logical-not rejects numerics
        raises(ValueError, lambda: plan.unary("-", bool_col))
        raises(ValueError, lambda: plan.unary("+", bool_col))
        raises(ValueError, lambda: plan.unary("~", uint_col))
        raises(ValueError, lambda: plan.unary("~", float_col))

        not_c = plan.unary("~", bool_col)
        return (pos_a, "c3"), (neg_a, "c4"), (pos_b, "c5"), (neg_b, "c6"), (not_c, "c10")

    pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"])
    assert_duck_eq(
        run_plan(pdf, "a:uint,b:float32,c:bool", build),
        """
        SELECT
            a AS c3, -a AS c4,
            b AS c5, -b AS c6,
            NOT c AS c10
        FROM a
        """,
        a=pdf,
        check_order=False,
    )


def test_binary_op_num():
    """Binary arithmetic: int op int stays int64; any float operand widens to float64."""

    def build(plan: ExecutionPlan, df: Operator):
        a, b = plan.col(df, "a"), plan.col(df, "b")
        out = []
        names = iter("abcdefghijklmnop")

        # int op int -> int64 (division uses a literal divisor)
        for op, rhs in [("+", a), ("-", a), ("*", a), ("/", plan.lit(2))]:
            col = plan.binary(op, a, rhs)
            assert pa.int64() == col.output_type
            out.append((col, next(names)))

        # a float on either side promotes the result to float64
        for lhs, rhs in [(a, b), (b, a), (b, b)]:
            for op in ["+", "-", "*", "/"]:
                col = plan.binary(op, lhs, rhs)
                assert pa.float64() == col.output_type
                out.append((col, next(names)))
        return out

    pdf = pd.DataFrame([[1, 1.1], [3, 4.1]], columns=["a", "b"])
    assert_duck_eq(
        run_plan(pdf, "a:uint,b:float32", build),
        """
        SELECT
            a+a AS a, a-a AS b, a*a AS c, a/2 AS d,
            a+b AS e, a-b AS f, a*b AS g, a/b AS h,
            b+a AS i, b-a AS j, b*a AS k, b/a AS l,
            b+b AS m, b-b AS n, b*b AS o, b/b AS p
        FROM a
        """,
        a=pdf,
        check_order=False,
    )


def test_binary_op_logical():
    """Three-valued AND/OR between columns and boolean/NULL literals."""

    def build(plan: ExecutionPlan, df: Operator):
        a, b = plan.col(df, "a"), plan.col(df, "b")
        specs = [
            ("&", b),
            ("|", b),
            ("&", plan.lit(True)),
            ("&", plan.lit(False)),
            ("&", plan.lit(None)),
            ("|", plan.lit(True)),
            ("|", plan.lit(False)),
            ("|", plan.lit(None)),
        ]
        out = []
        for name, (op, rhs) in zip("abcdefgh", specs):
            col = plan.binary(op, a, rhs)
            assert pa.bool_() == col.output_type
            out.append((col, name))
        return out

    pdf = pd.DataFrame(
        [
            [True, True],
            [True, False],
            [True, None],
            [False, True],
            [False, False],
            [False, None],
            [None, True],
            [None, False],
            [None, None],
        ],
        columns=["a", "b"],
    )
    assert_duck_eq(
        run_plan(pdf, "a:bool,b:bool", build),
        """
        SELECT
            a AND b AS a, a OR b AS b,
            a AND TRUE AS c, a AND FALSE AS d, a AND NULL AS e,
            a OR TRUE AS f, a OR FALSE AS g, a OR NULL AS h
        FROM a
        """,
        a=pdf,
        check_order=False,
    )


def test_binary_op_logical_2():
    """AND/OR over every pair of pure boolean/NULL literals matches SQL semantics."""

    def build(plan: ExecutionPlan, df: Operator, sql_parts):
        cols = []
        tv = [True, False, None]
        combos = ((op, l, r) for op in ["&", "|"] for l in tv for r in tv)
        for i, (op, left, right) in enumerate(combos):
            name = f"_{i}"
            col = plan.binary(op, plan.lit(left), plan.lit(right))
            assert pa.bool_() == col.output_type
            cols.append((col, name))
            # mirror the same expression in SQL for the equivalence check
            ls = "NULL" if left is None else str(left).upper()
            rs = "NULL" if right is None else str(right).upper()
            o = "AND" if op == "&" else "OR"
            sql_parts.append(f"{ls} {o} {rs} AS {name}")
        return cols

    pdf = pd.DataFrame(
        [
            [True, True],
            [True, False],
        ],
        columns=["a", "b"],
    )
    sql_parts = []
    res = run_plan(pdf, "a:bool,b:bool", lambda p, d: build(p, d, sql_parts))
    _sql = ", ".join(sql_parts)
    assert_duck_eq(
        res,
        f"SELECT {_sql} FROM a",
        a=pdf,
        check_order=False,
    )


def run_plan(pdf, schema, plan_func):
    """Build a plan over one input table ``a``, execute it, and return the output df.

    ``plan_func`` receives the plan and the input dataframe operator and returns
    the (column, alias) selections for the output.
    """
    plan = ExecutionPlan()
    df = plan.df("a", Schema(schema).pa_schema)
    selections = plan_func(plan, df)
    plan.output(plan.cols_to_df(*selections, reference=df))

    ctx = Context(PandasUtils())
    ctx["a"] = pdf
    plan.execute(ctx)

    return ctx.output
import pandas as pd
from slide.operators.map_operators import MapOperationsContext, MapExecutionPlan
from slide_pandas import PandasUtils
from slide_test.utils import assert_duck_eq, assert_pdf_eq
from triad import Schema, to_uuid
import pyarrow as pa
from pytest import raises


def test_col_op():
    """Column references preserve the declared type and column name."""
    pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"])
    ctx = MapOperationsContext(PandasUtils(), pdf)
    plan = MapExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema)
    cols = []
    for name, dtype in [("c", pa.bool_()), ("a", pa.uint32()), ("b", pa.float32())]:
        col = plan.col(name)
        assert dtype == col.output_type
        assert name == col.output_name
        cols.append(col)

    plan.output(*cols)
    plan.execute(ctx)

    assert_duck_eq(
        ctx.output,
        "SELECT c, a, b FROM a",
        a=pdf,
        check_order=False,
    )


def test_lit_op():
    """Literals infer their type (or take an explicit one) and can be aliased."""
    pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"])
    ctx = MapOperationsContext(PandasUtils(), pdf)
    plan = MapExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema)
    null_col = plan.lit(None)
    assert pa.null() == null_col.output_type
    str_col = plan.lit("abc")
    assert pa.string() == str_col.output_type
    int_col = plan.lit(1, pa.uint8())
    assert pa.uint8() == int_col.output_type

    plan.output((str_col, "x"), (int_col, "y"), plan.col("a"))
    plan.execute(ctx)

    assert_duck_eq(
        ctx.output,
        "SELECT 'abc' AS x, 1 AS y, a FROM a",
        a=pdf,
        check_order=False,
    )


def test_pure_lit_op():
    """An output made only of literals still yields one row per input row."""
    pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"])
    ctx = MapOperationsContext(PandasUtils(), pdf)
    plan = MapExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema)
    assert pa.null() == plan.lit(None).output_type
    str_col = plan.lit("abc")
    assert pa.string() == str_col.output_type
    int_col = plan.lit(1, pa.uint8())
    assert pa.uint8() == int_col.output_type
    bin_col = plan.lit(b"\0abc")
    assert pa.binary() == bin_col.output_type
    arr_col = plan.lit([1, 2])
    assert pa.types.is_nested(arr_col.output_type)

    plan.output((str_col, "a"), (int_col, "b"), (bin_col, "c"), (arr_col, "d"))
    plan.execute(ctx)

    # binary/nested values can't be compared through SQL, so compare raw values
    expected = [["abc", 1, b"\0abc", [1, 2]], ["abc", 1, b"\0abc", [1, 2]]]
    assert expected == ctx.output.astype(object).values.tolist()


def test_unary_op():
    """Unary +/- on numeric columns and ~ on booleans; invalid pairings raise."""
    pdf = pd.DataFrame([[0, 1.1, True], [3, 4.1, False]], columns=["a", "b", "c"])
    ctx = MapOperationsContext(PandasUtils(), pdf)
    plan = MapExecutionPlan(Schema("a:uint,b:float32,c:bool").pa_schema)
    bool_col = plan.col("c")
    uint_col = plan.col("a")
    float_col = plan.col("b")
    pos_a = plan.unary("+", uint_col)
    assert pa.uint32() == pos_a.output_type
    assert "a" == pos_a.output_name
    neg_a = plan.unary("-", uint_col)
    # negation must widen the unsigned input to a signed type
    assert pa.int64() == neg_a.output_type
    pos_b = plan.unary("+", float_col)
    assert pa.float32() == pos_b.output_type
    neg_b = plan.unary("-", float_col)
    assert pa.float32() == neg_b.output_type

    # arithmetic signs reject booleans; logical-not rejects numerics
    raises(ValueError, lambda: plan.unary("-", bool_col))
    raises(ValueError, lambda: plan.unary("+", bool_col))
    raises(ValueError, lambda: plan.unary("~", uint_col))
    raises(ValueError, lambda: plan.unary("~", float_col))

    not_c = plan.unary("~", bool_col)
    plan.output((pos_a, "c3"), (neg_a, "c4"), (pos_b, "c5"), (neg_b, "c6"), (not_c, "c10"))
    plan.execute(ctx)

    assert_duck_eq(
        ctx.output,
        """
        SELECT
            a AS c3, -a AS c4,
            b AS c5, -b AS c6,
            NOT c AS c10
        FROM a
        """,
        a=pdf,
        check_order=False,
    )


def test_binary_op_num():
    """Binary arithmetic: int op int stays int64; any float operand widens to float64."""
    pdf = pd.DataFrame([[1, 1.1], [3, 4.1]], columns=["a", "b"])
    ctx = MapOperationsContext(PandasUtils(), pdf)
    plan = MapExecutionPlan(Schema("a:uint,b:float32").pa_schema)
    a, b = plan.col("a"), plan.col("b")
    out = []
    names = iter("abcdefghijklmnop")

    # int op int -> int64 (division uses a literal divisor)
    for op, rhs in [("+", a), ("-", a), ("*", a), ("/", plan.lit(2))]:
        col = plan.binary(op, a, rhs)
        assert pa.int64() == col.output_type
        out.append((col, next(names)))

    # a float on either side promotes the result to float64
    for lhs, rhs in [(a, b), (b, a), (b, b)]:
        for op in ["+", "-", "*", "/"]:
            col = plan.binary(op, lhs, rhs)
            assert pa.float64() == col.output_type
            out.append((col, next(names)))

    plan.output(*out)
    plan.execute(ctx)

    assert_duck_eq(
        ctx.output,
        """
        SELECT
            a+a AS a, a-a AS b, a*a AS c, a/2 AS d,
            a+b AS e, a-b AS f, a*b AS g, a/b AS h,
            b+a AS i, b-a AS j, b*a AS k, b/a AS l,
            b+b AS m, b-b AS n, b*b AS o, b/b AS p
        FROM a
        """,
        a=pdf,
        check_order=False,
    )


def test_binary_op_logical():
    """Three-valued AND/OR between columns and boolean/NULL literals."""
    pdf = pd.DataFrame(
        [
            [True, True],
            [True, False],
            [True, None],
            [False, True],
            [False, False],
            [False, None],
            [None, True],
            [None, False],
            [None, None],
        ],
        columns=["a", "b"],
    )
    ctx = MapOperationsContext(PandasUtils(), pdf)
    plan = MapExecutionPlan(Schema("a:bool,b:bool").pa_schema)
    a, b = plan.col("a"), plan.col("b")
    specs = [
        ("&", b),
        ("|", b),
        ("&", plan.lit(True)),
        ("&", plan.lit(False)),
        ("&", plan.lit(None)),
        ("|", plan.lit(True)),
        ("|", plan.lit(False)),
        ("|", plan.lit(None)),
    ]
    out = []
    for name, (op, rhs) in zip("abcdefgh", specs):
        col = plan.binary(op, a, rhs)
        assert pa.bool_() == col.output_type
        out.append((col, name))

    plan.output(*out)
    plan.execute(ctx)

    assert_duck_eq(
        ctx.output,
        """
        SELECT
            a AND b AS a, a OR b AS b,
            a AND TRUE AS c, a AND FALSE AS d, a AND NULL AS e,
            a OR TRUE AS f, a OR FALSE AS g, a OR NULL AS h
        FROM a
        """,
        a=pdf,
        check_order=False,
    )


def test_binary_op_logical_2():
    """AND/OR over every pair of pure boolean/NULL literals matches SQL semantics."""
    pdf = pd.DataFrame(
        [
            [True, True],
            [True, False],
        ],
        columns=["a", "b"],
    )
    ctx = MapOperationsContext(PandasUtils(), pdf)
    plan = MapExecutionPlan(Schema("a:bool,b:bool").pa_schema)
    output = []
    sql = []
    tv = [True, False, None]
    combos = ((op, l, r) for op in ["&", "|"] for l in tv for r in tv)
    for i, (op, left, right) in enumerate(combos):
        name = f"_{i}"
        col = plan.binary(op, plan.lit(left), plan.lit(right))
        assert pa.bool_() == col.output_type
        output.append((col, name))
        # mirror the same expression in SQL for the equivalence check
        ls = "NULL" if left is None else str(left).upper()
        rs = "NULL" if right is None else str(right).upper()
        o = "AND" if op == "&" else "OR"
        sql.append(f"{ls} {o} {rs} AS {name}")
    plan.output(*output)
    plan.execute(ctx)

    _sql = ", ".join(sql)
    assert_duck_eq(
        ctx.output, f"SELECT {_sql} FROM a", a=pdf, check_order=False, debug=True
    )


def test_plan():
    """End-to-end plan: dedup of identical ops and output schema inference."""
    pdf = pd.DataFrame([[0, 1.1], [3, 4.1]], columns=["a", "b"])
    ctx = MapOperationsContext(PandasUtils(), pdf)
    plan = MapExecutionPlan(Schema("a:int,b:float").pa_schema)
    b_col = plan.col("b")
    a_col = plan.col("a")
    add = plan.binary("+", b_col, a_col)
    sub = plan.binary("-", add, plan.lit(2))
    size_before = len(plan)
    add_again = plan.binary("+", b_col, a_col)  # identical op is deduplicated
    assert size_before == len(plan)
    neg = plan.unary("-", add_again)
    # select: b, a, a+b as x, a+b-2 as y, -(a+b) as z
    plan.output(b_col, a_col, (add, "x"), (sub, "y"), (neg, "z"))
    assert Schema(plan.output_schema) == "b:float,a:int,x:double,y:double,z:double"
    plan.execute(ctx)
    assert_duck_eq(
        ctx.output,
        "SELECT b, a, a+b AS x, a+b-2 AS y, -(a+b) AS z FROM a",
        a=pdf,
        check_order=False,
    )


def test_plan_uuid():
    """The plan uuid reflects both the input schema and the sequence of unique ops."""
    plan1 = MapExecutionPlan(Schema("a:int,b:float32").pa_schema)
    plan2 = MapExecutionPlan(Schema("a:int,b:float64").pa_schema)
    plan3 = MapExecutionPlan(Schema("a:int,b:float64").pa_schema)
    assert to_uuid(plan1) != to_uuid(plan2)  # schemas differ
    assert to_uuid(plan3) == to_uuid(plan2)  # identical empty plans

    plan2.col("a")
    plan3.col("b")
    tid = to_uuid(plan2)
    assert to_uuid(plan3) != to_uuid(plan2)  # different ops -> different uuid

    plan2.col("a")  # duplicate op is deduplicated, uuid unchanged
    assert tid == to_uuid(plan2)
    plan2.col("b")  # genuinely new op changes the uuid
    assert tid != to_uuid(plan2)