diff --git a/.github/renovate.json b/.github/renovate.json index 8760d844ed1c0..90f52c7659666 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -29,10 +29,6 @@ "matchPackagePrefixes": ["clickhouse"], "addLabels": ["clickhouse"] }, - { - "matchPackagePatterns": ["dask"], - "addLabels": ["dask"] - }, { "matchPackagePatterns": ["datafusion"], "addLabels": ["datafusion"] diff --git a/.github/workflows/ibis-backends.yml b/.github/workflows/ibis-backends.yml index d85adf947d958..653680c0896bc 100644 --- a/.github/workflows/ibis-backends.yml +++ b/.github/workflows/ibis-backends.yml @@ -233,13 +233,6 @@ jobs: services: - flink include: - - os: ubuntu-latest - python-version: "3.11.8" - backend: - name: dask - title: Dask - extras: - - dask - os: ubuntu-latest python-version: "3.11" backend: @@ -527,17 +520,6 @@ jobs: - "3.10" - "3.12" backend: - - name: dask - title: Dask - deps: - required: - - "numpy@1.23.5" - - "pyarrow@10.0.1" - optional: - - "dask[array,dataframe]@2022.9.1" - - "pandas@1.5.3" - extras: - - dask - name: postgres title: PostgreSQL deps: @@ -588,19 +570,6 @@ jobs: extras: - postgres - geospatial - - python-version: "3.12" - backend: - name: dask - title: Dask - deps: - required: - - "numpy@1.23.5" - - "pyarrow@10.0.1" - optional: - - "dask[array,dataframe]@2022.9.1" - - "pandas@1.5.3" - extras: - - dask steps: - name: checkout uses: actions/checkout@v4 diff --git a/README.md b/README.md index c91a53739b2e8..6c5059631be78 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,6 @@ Ibis supports 20+ backends: - [Apache PySpark](https://ibis-project.org/backends/pyspark/) - [BigQuery](https://ibis-project.org/backends/bigquery/) - [ClickHouse](https://ibis-project.org/backends/clickhouse/) -- [Dask](https://ibis-project.org/backends/dask/) - [DuckDB](https://ibis-project.org/backends/duckdb/) - [Exasol](https://ibis-project.org/backends/exasol) - [MySQL](https://ibis-project.org/backends/mysql/) diff --git a/conda/environment-arm64-flink.yml b/conda/environment-arm64-flink.yml index 746679da7bd04..a96d78ff73e76 100644 --- a/conda/environment-arm64-flink.yml +++ b/conda/environment-arm64-flink.yml @@ -7,7 +7,6 @@ dependencies: - atpublic >=2.3 - black >=22.1.0,<25 - clickhouse-connect >=0.5.23 - - dask >=2022.9.1 - datafusion >=0.6 - db-dtypes >=0.3.0,<2 - deltalake diff --git a/conda/environment-arm64.yml b/conda/environment-arm64.yml index ef0b5a58e0074..a32123966d4c7 100644 --- a/conda/environment-arm64.yml +++ b/conda/environment-arm64.yml @@ -7,7 +7,6 @@ dependencies: - atpublic >=2.3 - black >=22.1.0,<25 - clickhouse-connect >=0.5.23 - - dask >=2022.9.1 - datafusion >=0.6 - db-dtypes >=0.3.0,<2 - deltalake diff --git a/conda/environment.yml b/conda/environment.yml index 2ed9223e38b6b..fbfb53b30b48e 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -7,7 +7,6 @@ dependencies: - atpublic >=2.3 - black >=22.1.0,<25 - clickhouse-connect >=0.5.23 - - dask >=2022.9.1 - datafusion >=0.6 - db-dtypes >=0.3.0,<2 - deltalake diff --git a/docs/_tabsets/install.qmd b/docs/_tabsets/install.qmd index dbe5b334fe676..e199c4958ecbd 100644 --- a/docs/_tabsets/install.qmd +++ b/docs/_tabsets/install.qmd @@ -10,7 +10,6 @@ from textwrap import dedent backends = [ {"name": "BigQuery", "module": "bigquery"}, {"name": "ClickHouse", "module": "clickhouse"}, - {"name": "Dask", "module": "dask"}, {"name": "DataFusion", "module": "datafusion"}, {"name": "Druid", "module": "druid"}, {"name": "DuckDB", "module": "duckdb"}, diff --git a/docs/backend_table_hiearchy.qmd 
b/docs/backend_table_hiearchy.qmd index 1f6013e62dce9..741ecb0af1dc5 100644 --- a/docs/backend_table_hiearchy.qmd +++ b/docs/backend_table_hiearchy.qmd @@ -20,7 +20,6 @@ use the terms `catalog` and `database` and map them onto the appropriate fields. |------------|----------------|------------| | bigquery | project | database | | clickhouse | | database | -| dask | | NA | | datafusion | catalog | schema | | druid | dataSourceType | dataSource | | duckdb | database | schema | diff --git a/docs/backends/dask.qmd b/docs/backends/dask.qmd index e2fa0474e0029..4176ec4163140 100644 --- a/docs/backends/dask.qmd +++ b/docs/backends/dask.qmd @@ -1,74 +1,4 @@ # Dask -[https://www.dask.org](https://www.dask.org) - -![](https://img.shields.io/badge/memtables-native-green?style=flat-square) ![](https://img.shields.io/badge/inputs-CSV | Parquet-blue?style=flat-square) ![](https://img.shields.io/badge/outputs-CSV | pandas | Parquet | PyArrow-orange?style=flat-square) - -## Install - -Install Ibis and dependencies for the Dask backend: - -::: {.panel-tabset} - -## `pip` - -Install with the `dask` extra: - -```{.bash} -pip install 'ibis-framework[dask]' -``` - -And connect: - -```{.python} -import ibis - -con = ibis.dask.connect() # <1> -``` - -1. Adjust connection parameters as needed. - -## `conda` - -Install for Dask: - -```{.bash} -conda install -c conda-forge ibis-dask -``` - -And connect: - -```{.python} -import ibis - -con = ibis.dask.connect() # <1> -``` - -1. Adjust connection parameters as needed. - -## `mamba` - -Install for Dask: - -```{.bash} -mamba install -c conda-forge ibis-dask -``` - -And connect: - -```{.python} -import ibis - -con = ibis.dask.connect() # <1> -``` - -1. Adjust connection parameters as needed. - -::: - -```{python} -#| echo: false -BACKEND = "Dask" -``` - -{{< include ./_templates/api.qmd >}} +The Dask backend was removed in Ibis version 10.0, due to lack of interest from +the community and an increasing maintenance burden. diff --git a/docs/why.qmd b/docs/why.qmd index 1f9078f70b17d..cc80afab84a97 100644 --- a/docs/why.qmd +++ b/docs/why.qmd @@ -324,7 +324,6 @@ use Ibis with other tools over time. 
Ibis already works with other Python dataframes like: - [pandas](https://github.com/pandas-dev/pandas) -- [Dask](https://github.com/dask/dask) - [Polars](https://github.com/pola-rs/polars) Ibis already works well with visualization libraries like: diff --git a/flake.nix b/flake.nix index ad28f49dae350..7ff1da9a58891 100644 --- a/flake.nix +++ b/flake.nix @@ -138,9 +138,7 @@ ibis311 = mkDevShell pkgs.ibisDevEnv311; ibis312 = mkDevShell pkgs.ibisDevEnv312; - # move back to 3.12 when dask-expr is supported or the dask backend is - # removed - default = ibis310; + default = ibis312; preCommit = pkgs.mkShell { name = "preCommit"; diff --git a/ibis/backends/conftest.py b/ibis/backends/conftest.py index 1d69af343588b..8ec2177893e69 100644 --- a/ibis/backends/conftest.py +++ b/ibis/backends/conftest.py @@ -463,7 +463,7 @@ def _setup_backend(request, data_dir, tmp_path_factory, worker_id): @pytest.fixture( - params=_get_backends_to_test(discard=("dask", "pandas")), + params=_get_backends_to_test(discard=("pandas",)), scope="session", ) def ddl_backend(request, data_dir, tmp_path_factory, worker_id): @@ -478,7 +478,7 @@ def ddl_con(ddl_backend): @pytest.fixture( - params=_get_backends_to_test(keep=("dask", "pandas", "pyspark")), + params=_get_backends_to_test(keep=("pandas", "pyspark")), scope="session", ) def udf_backend(request, data_dir, tmp_path_factory, worker_id): diff --git a/ibis/backends/dask/__init__.py b/ibis/backends/dask/__init__.py deleted file mode 100644 index b8e217affccb0..0000000000000 --- a/ibis/backends/dask/__init__.py +++ /dev/null @@ -1,185 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -import dask -import dask.dataframe as dd -import pandas as pd - -import ibis.common.exceptions as com - -# import the pandas execution module to register dispatched implementations of -# execute_node that the dask backend will later override -import ibis.expr.types as ir -from ibis import util -from ibis.backends import NoUrl -from ibis.backends.pandas import BasePandasBackend -from ibis.formats.pandas import PandasData - -if TYPE_CHECKING: - import pathlib - from collections.abc import Mapping, MutableMapping - - -class Backend(BasePandasBackend, NoUrl): - name = "dask" - backend_table_type = dd.DataFrame - supports_in_memory_tables = False - - def do_connect( - self, - dictionary: MutableMapping[str, dd.DataFrame] | None = None, - ) -> None: - """Construct a Dask backend client from a dictionary of data sources. - - Parameters - ---------- - dictionary - An optional mapping from `str` table names to Dask DataFrames. - - Examples - -------- - >>> import ibis - >>> import dask.dataframe as dd - >>> data = { - ... "t": dd.read_parquet("path/to/file.parquet"), - ... "s": dd.read_csv("path/to/file.csv"), - ... } - >>> ibis.dask.connect(data) - - """ - if dictionary is None: - dictionary = {} - - for k, v in dictionary.items(): - if not isinstance(v, (dd.DataFrame, pd.DataFrame)): - raise TypeError( - f"Expected an instance of 'dask.dataframe.DataFrame' for {k!r}," - f" got an instance of '{type(v).__name__}' instead." 
- ) - super().do_connect(dictionary) - - def disconnect(self) -> None: - pass - - @property - def version(self): - return dask.__version__ - - def _validate_args(self, expr, limit): - if limit != "default" and limit is not None: - raise com.UnsupportedArgumentError( - "limit parameter to execute is not yet implemented in the " - "dask backend" - ) - if not isinstance(expr, ir.Expr): - raise TypeError( - f"`expr` has type {type(expr).__name__!r}, expected ibis.expr.types.Expr" - ) - - def compile( - self, - expr: ir.Expr, - params: dict | None = None, - limit: int | None = None, - **kwargs, - ): - from ibis.backends.dask.executor import DaskExecutor - - self._validate_args(expr, limit) - params = params or {} - params = {k.op() if isinstance(k, ir.Expr) else k: v for k, v in params.items()} - - return DaskExecutor.compile(expr.op(), backend=self, params=params) - - def execute( - self, - expr: ir.Expr, - params: Mapping[ir.Expr, object] | None = None, - limit: str = "default", - **kwargs, - ): - from ibis.backends.dask.executor import DaskExecutor - - self._validate_args(expr, limit) - params = params or {} - params = {k.op() if isinstance(k, ir.Expr) else k: v for k, v in params.items()} - - return DaskExecutor.execute(expr.op(), backend=self, params=params) - - def read_csv( - self, source: str | pathlib.Path, table_name: str | None = None, **kwargs: Any - ): - """Register a CSV file as a table in the current session. - - Parameters - ---------- - source - The data source. Can be a local or remote file, pathlike objects - also accepted. - table_name - An optional name to use for the created table. This defaults to - a generated name. - **kwargs - Additional keyword arguments passed to Pandas loading function. - See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html - for more information. - - Returns - ------- - ir.Table - The just-registered table - - """ - table_name = table_name or util.gen_name("read_csv") - df = dd.read_csv(source, **kwargs) - self.dictionary[table_name] = df - return self.table(table_name) - - def read_parquet( - self, source: str | pathlib.Path, table_name: str | None = None, **kwargs: Any - ): - """Register a parquet file as a table in the current session. - - Parameters - ---------- - source - The data source(s). May be a path to a file, an iterable of files, - or directory of parquet files. - table_name - An optional name to use for the created table. This defaults to - a generated name. - **kwargs - Additional keyword arguments passed to Pandas loading function. - See https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html - for more information. 
- - Returns - ------- - ir.Table - The just-registered table - - """ - table_name = table_name or util.gen_name("read_parquet") - df = dd.read_parquet(source, **kwargs) - self.dictionary[table_name] = df - return self.table(table_name) - - def get_schema(self, table_name, *, database=None): - try: - schema = self.schemas[table_name] - except KeyError: - df = self.dictionary[table_name] - self.schemas[table_name] = schema = PandasData.infer_table(df.head(1)) - - return schema - - def _convert_object(self, obj) -> dd.DataFrame: - if isinstance(obj, dd.DataFrame): - return obj - - pandas_df = super()._convert_object(obj) - return dd.from_pandas(pandas_df, npartitions=1) - - def _load_into_cache(self, name, expr): - self.create_table(name, self.compile(expr).persist()) diff --git a/ibis/backends/dask/convert.py b/ibis/backends/dask/convert.py deleted file mode 100644 index d74758ee0d98e..0000000000000 --- a/ibis/backends/dask/convert.py +++ /dev/null @@ -1,84 +0,0 @@ -from __future__ import annotations - -import dask.dataframe as dd -import numpy as np -import pandas as pd -import pandas.api.types as pdt - -import ibis.expr.datatypes as dt -from ibis.backends.pandas.convert import PandasConverter -from ibis.formats.pandas import DataMapper, PandasType - - -class DaskConverter(DataMapper): - @classmethod - def convert_scalar(cls, obj, dtype): - return PandasConverter.convert_scalar(obj, dtype) - - @classmethod - def convert_column(cls, obj, dtype): - pandas_type = PandasType.from_ibis(dtype) - - method_name = f"convert_{dtype.__class__.__name__}" - convert_method = getattr(cls, method_name, cls.convert_default) - - return convert_method(obj, dtype, pandas_type) - - @classmethod - def convert_default(cls, s, dtype, pandas_type): - if pandas_type == np.object_: - func = lambda x: x if x is pd.NA else dt.normalize(dtype, x) - meta = (s.name, pandas_type) - return s.map(func, na_action="ignore", meta=meta).astype(pandas_type) - else: - return s.astype(pandas_type) - - @classmethod - def convert_Integer(cls, s, dtype, pandas_type): - if pdt.is_datetime64_any_dtype(s.dtype): - return s.astype("int64").floordiv(int(1e9)).astype(pandas_type) - else: - return s.astype(pandas_type) - - convert_SignedInteger = convert_UnsignedInteger = convert_Integer - convert_Int64 = convert_Int32 = convert_Int16 = convert_Int8 = convert_SignedInteger - convert_UInt64 = convert_UInt32 = convert_UInt16 = convert_UInt8 = ( - convert_UnsignedInteger - ) - - @classmethod - def convert_Floating(cls, s, dtype, pandas_type): - if pdt.is_datetime64_any_dtype(s.dtype): - return s.astype("int64").floordiv(int(1e9)).astype(pandas_type) - else: - return s.astype(pandas_type) - - convert_Float64 = convert_Float32 = convert_Float16 = convert_Floating - - @classmethod - def convert_Timestamp(cls, s, dtype, pandas_type): - if isinstance(s.dtype, pd.DatetimeTZDtype): - return s.dt.tz_convert(dtype.timezone) - elif pdt.is_datetime64_dtype(s.dtype): - return s.dt.tz_localize(dtype.timezone) - elif pdt.is_numeric_dtype(s.dtype): - return dd.to_datetime(s, unit="s").dt.tz_localize(dtype.timezone) - else: - return dd.to_datetime(s, utc=True).dt.tz_localize(dtype.timezone) - - @classmethod - def convert_Date(cls, s, dtype, pandas_type): - if isinstance(s.dtype, pd.DatetimeTZDtype): - s = s.dt.tz_convert("UTC").dt.tz_localize(None) - elif pdt.is_numeric_dtype(s.dtype): - s = dd.to_datetime(s, unit="D") - else: - s = dd.to_datetime(s) - - return s.dt.normalize() - - @classmethod - def convert_String(cls, s, dtype, pandas_type): - # 
TODO(kszucs): should switch to the new pandas string type and convert - # object columns using s.convert_dtypes() method - return s.map(str, na_action="ignore").astype(object) diff --git a/ibis/backends/dask/executor.py b/ibis/backends/dask/executor.py deleted file mode 100644 index 12d975d799667..0000000000000 --- a/ibis/backends/dask/executor.py +++ /dev/null @@ -1,448 +0,0 @@ -from __future__ import annotations - -import operator -from functools import reduce - -import dask.array as da -import dask.dataframe as dd -import numpy as np -import pandas as pd - -import ibis.backends.dask.kernels as dask_kernels -import ibis.expr.operations as ops -from ibis.backends.dask.convert import DaskConverter -from ibis.backends.dask.helpers import ( - DaskUtils, - add_globally_consecutive_column, -) -from ibis.backends.pandas.executor import PandasExecutor -from ibis.backends.pandas.rewrites import ( - PandasAggregate, - PandasJoin, - PandasLimit, - PandasResetIndex, - PandasScalarSubquery, - PandasWindowFrame, - PandasWindowFunction, - plan, -) -from ibis.common.exceptions import UnboundExpressionError, UnsupportedOperationError -from ibis.formats.pandas import PandasData, PandasType -from ibis.util import gen_name - -# ruff: noqa: F811 - - -def limit_df( - df: dd.DataFrame, - col: str, - n: int | pd.DataFrame, - offset: int | pd.DataFrame, -): - if isinstance(offset, pd.DataFrame): - offset = offset.iat[0, 0] - if isinstance(n, pd.DataFrame): - n = n.iat[0, 0] - - if n is None: - return df[df[col] >= offset] - - return df[df[col].between(offset, offset + n - 1)] - - -def argminmax_chunk(df, keycol, valcol, method): - idx = getattr(df[keycol], method)() - return df[[keycol, valcol]].iloc[idx : idx + 1] - - -def argminmax_aggregate(df, keycol, valcol, method): - return df[valcol].iloc[getattr(df[keycol], method)()] - - -class DaskExecutor(PandasExecutor, DaskUtils): - name = "dask" - kernels = dask_kernels - - @classmethod - def visit(cls, op: ops.Node, **kwargs): - return super().visit(op, **kwargs) - - @classmethod - def visit(cls, op: ops.Cast, arg, to): - if arg is None: - return None - elif isinstance(arg, dd.Series): - return DaskConverter.convert_column(arg, to) - else: - return DaskConverter.convert_scalar(arg, to) - - @classmethod - def visit( - cls, op: ops.SimpleCase | ops.SearchedCase, cases, results, default, base=None - ): - def mapper(df, cases, results, default): - cases = [case.astype("bool") for case in cases] - cases.append(pd.Series(True, index=df.index)) - - results.append(default) - out = np.select(cases, results) - - return pd.Series(out, index=df.index) - - dtype = PandasType.from_ibis(op.dtype) - if base is not None: - cases = tuple(base == case for case in cases) - kwargs = dict(cases=cases, results=results, default=default) - - return cls.partitionwise(mapper, kwargs, name=op.name, dtype=dtype) - - @classmethod - def visit(cls, op: ops.IntervalFromInteger, unit, **kwargs): - if unit.short in {"Y", "Q", "M", "W"}: - return cls.elementwise( - lambda v: pd.DateOffset(**{unit.plural: v}), - kwargs, - name=op.name, - dtype=object, - ) - else: - return cls.serieswise( - lambda arg: arg.astype(f"timedelta64[{unit.short}]"), kwargs - ) - - @classmethod - def visit(cls, op: ops.BetweenTime, arg, lower_bound, upper_bound): - if getattr(arg.dtype, "tz", None) is not None: - localized = arg.dt.tz_convert("UTC").dt.tz_localize(None) - else: - localized = arg - - time = localized.dt.time.astype(str) - indexer = ((time >= lower_bound) & (time <= upper_bound)).to_dask_array(True) - - 
result = da.zeros(len(arg), dtype=np.bool_) - result[indexer] = True - return dd.from_array(result) - - @classmethod - def visit(cls, op: ops.FindInSet, needle, values): - def mapper(df, cases): - thens = [i for i, _ in enumerate(cases)] - out = np.select(cases, thens, default=-1) - return pd.Series(out, index=df.index) - - dtype = PandasType.from_ibis(op.dtype) - cases = [needle == value for value in values] - kwargs = dict(cases=cases) - return cls.partitionwise(mapper, kwargs, name=op.name, dtype=dtype) - - @classmethod - def visit(cls, op: ops.Array, exprs): - return cls.rowwise( - lambda row: np.array(row, dtype=object), exprs, name=op.name, dtype=object - ) - - @classmethod - def visit(cls, op: ops.StructColumn, names, values): - return cls.rowwise( - lambda row: dict(zip(names, row)), values, name=op.name, dtype=object - ) - - @classmethod - def visit(cls, op: ops.ArrayConcat, arg): - dtype = PandasType.from_ibis(op.dtype) - return cls.rowwise( - lambda row: np.concatenate(row.values), arg, name=op.name, dtype=dtype - ) - - @classmethod - def visit(cls, op: ops.Unnest, arg): - arg = cls.asseries(arg) - mask = arg.map(lambda v: bool(len(v)), na_action="ignore") - return arg[mask].explode() - - @classmethod - def visit( - cls, op: ops.ElementWiseVectorizedUDF, func, func_args, input_type, return_type - ): - """Execute an elementwise UDF.""" - - def mapper(df): - cols = [df[col] for col in df] - return func(*cols) - - df, _ = cls.asframe(func_args) - result = df.map_partitions(mapper) - if op.dtype.is_struct(): - result = result.apply(lambda row: row.to_dict(), axis=1) - return result - - ############################# Reductions ################################## - - @classmethod - def visit(cls, op: ops.ArgMin | ops.ArgMax, arg, key, where): - method = "argmin" if isinstance(op, ops.ArgMin) else "argmax" - - def agg(df): - if where is not None: - df = df.where(df[where.name]) - - if isinstance(df, dd.DataFrame): - return df.reduction( - chunk=argminmax_chunk, - combine=argminmax_chunk, - aggregate=argminmax_aggregate, - meta=op.dtype.to_pandas(), - token=method, - keycol=key.name, - valcol=arg.name, - method=method, - ) - else: - return argminmax_aggregate(df, key.name, arg.name, method) - - return agg - - @classmethod - def visit(cls, op: ops.Correlation, left, right, where, how): - if how == "pop": - raise UnsupportedOperationError( - "Dask doesn't support `corr` with `how='pop'`" - ) - - def agg(df): - if where is not None: - df = df.where(df[where.name]) - - return df[left.name].corr(df[right.name]) - - return agg - - @classmethod - def visit(cls, op: ops.Covariance, left, right, where, how): - if how == "pop": - raise UnsupportedOperationError( - "Dask doesn't support `cov` with `how='pop'`" - ) - - def agg(df): - if where is not None: - df = df.where(df[where.name]) - - return df[left.name].cov(df[right.name]) - - return agg - - @classmethod - def visit( - cls, op: ops.ReductionVectorizedUDF, func, func_args, input_type, return_type - ): - def agg(df): - # if df is a dask dataframe then we collect it to a pandas dataframe - # because the user-defined function expects a pandas dataframe - if isinstance(df, dd.DataFrame): - df = df.compute() - args = [df[col.name] for col in func_args] - return func(*args) - - return agg - - @classmethod - def visit( - cls, op: ops.AnalyticVectorizedUDF, func, func_args, input_type, return_type - ): - def agg(df, order_keys): - # if df is a dask dataframe then we collect it to a pandas dataframe - # because the user-defined function expects a 
pandas dataframe - if isinstance(df, dd.DataFrame): - df = df.compute() - args = [df[col.name] for col in func_args] - res = func(*args) - if isinstance(res, pd.DataFrame): - # it is important otherwise it is going to fill up the memory - res = res.apply(lambda row: row.to_dict(), axis=1) - return res - - return agg - - ############################ Window functions ############################# - - @classmethod - def visit(cls, op: PandasWindowFrame, table, start, end, **kwargs): - table = table.compute() - if isinstance(start, dd.Series): - start = start.compute() - if isinstance(end, dd.Series): - end = end.compute() - return super().visit(op, table=table, start=start, end=end, **kwargs) - - @classmethod - def visit(cls, op: PandasWindowFunction, func, frame): - result = super().visit(op, func=func, frame=frame) - return cls.asseries(result) - - ############################ Relational ################################### - - @classmethod - def visit(cls, op: ops.DatabaseTable, name, schema, source, namespace): - try: - return source.dictionary[name] - except KeyError: - raise UnboundExpressionError( - f"{name} is not a table in the {source.name!r} backend, you " - "probably tried to execute an expression without a data source" - ) - - @classmethod - def visit(cls, op: ops.InMemoryTable, name, schema, data): - df = data.to_frame().reset_index(drop=True) - return dd.from_pandas(df, npartitions=1) - - @classmethod - def visit(cls, op: ops.DummyTable, values): - df, _ = cls.asframe(values) - return df - - @classmethod - def visit(cls, op: PandasLimit, parent, n, offset): - name = gen_name("limit") - df = add_globally_consecutive_column(parent, name, set_as_index=False) - - return df.map_partitions( - limit_df, - col=name, - n=n, - offset=offset, - align_dataframes=False, - meta=df._meta, - ).drop(columns=[name]) - - @classmethod - def visit(cls, op: PandasResetIndex, parent): - return add_globally_consecutive_column(parent) - - @classmethod - def visit(cls, op: PandasJoin, **kwargs): - df = super().visit(op, **kwargs) - return add_globally_consecutive_column(df) - - @classmethod - def visit(cls, op: ops.Project, parent, values): - df, all_scalars = cls.asframe(values) - if all_scalars and len(parent) != len(df): - df = dd.concat([df] * len(parent)) - return df - - @classmethod - def visit(cls, op: ops.Filter, parent, predicates): - if predicates: - pred = reduce(operator.and_, predicates) - parent = parent.loc[pred].reset_index(drop=True) - return parent - - @classmethod - def visit(cls, op: ops.Sort, parent, keys): - # 1. add sort key columns to the dataframe if they are not already present - # 2. sort the dataframe using those columns - # 3. 
drop the sort key columns - ascending = [key.ascending for key in op.keys] - nulls_first = [key.nulls_first for key in op.keys] - - if all(nulls_first): - na_position = "first" - elif not any(nulls_first): - na_position = "last" - else: - raise ValueError( - "dask does not support specifying null ordering for individual columns" - ) - - newcols = {gen_name("sort_key"): col for col in keys} - names = list(newcols.keys()) - df = parent.assign(**newcols) - df = df.sort_values( - by=names, - ascending=ascending, - na_position=na_position, - ) - return df.drop(names, axis=1) - - @classmethod - def visit(cls, op: PandasAggregate, parent, groups, metrics): - if not groups: - results = {k: v(parent) for k, v in metrics.items()} - combined, _ = cls.asframe(results) - return combined - - parent = parent.groupby([col.name for col in groups.values()]) - - measures = {} - for name, metric in metrics.items(): - meta = pd.Series( - name=name, - dtype=PandasType.from_ibis(op.metrics[name].dtype), - index=pd.MultiIndex( - levels=[[] for _ in groups], - codes=[[] for _ in groups], - names=list(groups.keys()), - ), - ) - measures[name] = parent.apply(metric, meta=meta) - - result = cls.concat(measures, axis=1).reset_index() - renames = {v.name: k for k, v in op.groups.items()} - return result.rename(columns=renames) - - @classmethod - def visit(cls, op: ops.InValues, value, options): - if isinstance(value, dd.Series): - return value.isin(options) - else: - return value in options - - @classmethod - def visit(cls, op: ops.InSubquery, rel, needle): - first_column = rel.compute().iloc[:, 0] - if isinstance(needle, dd.Series): - return needle.isin(first_column) - else: - return needle in first_column - - @classmethod - def visit(cls, op: PandasScalarSubquery, rel): - # TODO(kszucs): raise a warning about triggering compute()? - # could the compute be avoided here? 
- return rel.compute().iat[0, 0] - - @classmethod - def compile(cls, node, backend, params): - def fn(node, _, **kwargs): - return cls.visit(node, **kwargs) - - node = node.to_expr().as_table().op() - node = plan(node, backend=backend, params=params) - return node.map_clear(fn) - - @classmethod - def execute(cls, node, backend, params): - original = node - node = node.to_expr().as_table().op() - result = cls.compile(node, backend=backend, params=params) - - # should happen when the result is empty - if isinstance(result, pd.DataFrame): - assert result.empty - else: - assert isinstance(result, dd.DataFrame) - result = result.compute() - - result = PandasData.convert_table(result, node.schema) - if isinstance(original, ops.Value): - if original.shape.is_scalar(): - return result.iloc[0, 0] - elif original.shape.is_columnar(): - return result.iloc[:, 0] - else: - raise TypeError(f"Unexpected shape: {original.shape}") - else: - return result diff --git a/ibis/backends/dask/helpers.py b/ibis/backends/dask/helpers.py deleted file mode 100644 index fa411f69274f4..0000000000000 --- a/ibis/backends/dask/helpers.py +++ /dev/null @@ -1,190 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import dask.array as da -import dask.dataframe as dd -import numpy as np -import pandas as pd - -from ibis.backends.pandas.helpers import PandasUtils - -if TYPE_CHECKING: - from collections.abc import Callable - - -class DaskUtils(PandasUtils): - @classmethod - def merge(cls, *args, **kwargs): - return dd.merge(*args, **kwargs) - - @classmethod - def merge_asof(cls, *args, **kwargs): - return dd.merge_asof(*args, **kwargs) - - @classmethod - def concat(cls, dfs, **kwargs): - if isinstance(dfs, dict): - dfs = [v.rename(k) for k, v in dfs.items()] - return dd.concat(dfs, **kwargs) - - @classmethod - def asseries(cls, value, like=None): - """Ensure that value is a pandas Series object, broadcast if necessary.""" - - if isinstance(value, dd.Series): - return value - elif isinstance(value, dd.core.Scalar): - # Create a Dask array from the Dask scalar - try: - dtype = value.dtype - except AttributeError: - # @property - # def dtype(self): - # > return self._meta.dtype - # E AttributeError: 'Timestamp' object has no attribute 'dtype' - dtype = object - array = da.from_delayed(value.to_delayed(), (1,), dtype=dtype) - # Create a Dask series from the Dask array - return dd.from_array(array) - elif isinstance(value, pd.Series): - return dd.from_pandas(value, npartitions=1) - elif like is not None: - if isinstance(value, (tuple, list, dict)): - fn = lambda df: pd.Series([value] * len(df), index=df.index) - else: - fn = lambda df: pd.Series(value, index=df.index) - return like.map_partitions(fn) - else: - return dd.from_pandas(pd.Series([value]), npartitions=1) - - @classmethod - def asframe(cls, values: dict | tuple): - # TODO(kszucs): prefer using assign instead of concat - """Construct a DataFrame from a dict or tuple of Series objects.""" - if isinstance(values, dict): - names, values = zip(*values.items()) - elif isinstance(values, tuple): - names = [f"_{i}" for i in range(len(values))] - else: - raise TypeError(f"values must be a dict, or tuple; got {type(values)}") - - all_scalars = True - representative = None - for v in values: - if isinstance(v, dd.Series): - all_scalars = False - representative = v - break - - columns = [cls.asseries(v, like=representative) for v in values] - columns = [v.rename(k) for k, v in zip(names, columns)] - - # dd.concat turns decimal.Decimal("NaN") into 
np.nan for some reason - df = dd.concat(columns, axis=1) - return df, all_scalars - - @classmethod - def rowwise(cls, func: Callable, operands, name, dtype): - if dtype == np.dtype(" dd.DataFrame: - """Add a column that is globally consecutive across the distributed data. - - By construction, this column is already sorted and can be used to partition - the data. - This column can act as if we had a global index across the distributed data. - This index needs to be consecutive in the range of [0, len(df)), allows - downstream operations to work properly. - The default index of dask dataframes is to be consecutive within each partition. - - Important properties: - - - Each row has a unique id (i.e. a value in this column) - - The global index that's added is consecutive in the same order that the rows currently are in. - - IDs within each partition are already sorted - - We also do not explicitly deal with overflow in the bounds. - - Parameters - ---------- - df: dd.DataFrame - Dataframe to add the column to - name: str - Name of the column to use. Default is _ibis_index - set_as_index: bool - If True, will set the consecutive column as the index. Default is True. - - Returns - ------- - dd.DataFrame - New dask dataframe with sorted partitioned index - - """ - if isinstance(df, dd.Series): - df = df.to_frame() - - if name in df.columns: - raise ValueError(f"Column {name} is already present in DataFrame") - - df = df.assign(**{name: 1}) - df = df.assign(**{name: df[name].cumsum() - 1}) - if set_as_index: - df = df.reset_index(drop=True) - df = df.set_index(name, sorted=True) - - # No elegant way to rename index https://github.com/dask/dask/issues/4950 - df = df.map_partitions(pd.DataFrame.rename_axis, None, axis="index") - - return df diff --git a/ibis/backends/dask/kernels.py b/ibis/backends/dask/kernels.py deleted file mode 100644 index 12a1a782ab01d..0000000000000 --- a/ibis/backends/dask/kernels.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import annotations - -import dask.dataframe as dd -import numpy as np - -import ibis.backends.pandas.kernels as pandas_kernels -import ibis.expr.operations as ops - -generic = pandas_kernels.generic.copy() -columnwise = pandas_kernels.columnwise.copy() -elementwise = pandas_kernels.elementwise.copy() -elementwise_decimal = pandas_kernels.elementwise_decimal.copy() - -rowwise = { - **pandas_kernels.rowwise, - ops.DateAdd: lambda row: row["left"] + row["right"], -} - - -def maybe_pandas_reduction(func): - def inner(df): - return df.reduction(func) if isinstance(df, dd.Series) else func(df) - - return inner - - -reductions = { - **pandas_kernels.reductions, - ops.Mode: lambda x: x.mode().loc[0], - ops.ApproxMedian: lambda x: x.median_approximate(), - ops.BitAnd: lambda x: x.reduction(np.bitwise_and.reduce), - ops.BitOr: lambda x: x.reduction(np.bitwise_or.reduce), - ops.BitXor: lambda x: x.reduction(np.bitwise_xor.reduce), - ops.Arbitrary: lambda x: x.reduction(pandas_kernels.first), - # Window functions are calculated locally using pandas - ops.Last: maybe_pandas_reduction(pandas_kernels.last), - ops.First: maybe_pandas_reduction(pandas_kernels.first), -} - -serieswise = { - **pandas_kernels.serieswise, - ops.StringAscii: lambda arg: arg.map( - ord, na_action="ignore", meta=(arg.name, "int32") - ), - ops.TimestampFromUNIX: lambda arg, unit: dd.to_datetime(arg, unit=unit.short), - ops.DayOfWeekIndex: lambda arg: dd.to_datetime(arg).dt.dayofweek, - ops.DayOfWeekName: lambda arg: dd.to_datetime(arg).dt.day_name(), -} - -# prefer other kernels for the 
following operations -del generic[ops.IsNull] -del generic[ops.NotNull] -del generic[ops.DateAdd] # must pass metadata -del serieswise[ops.Round] # dask series doesn't have a round() method -del serieswise[ops.Strftime] # doesn't support columnar format strings -del serieswise[ops.Substring] - - -supported_operations = ( - generic.keys() - | columnwise.keys() - | rowwise.keys() - | serieswise.keys() - | elementwise.keys() -) diff --git a/ibis/backends/dask/tests/__init__.py b/ibis/backends/dask/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/ibis/backends/dask/tests/conftest.py b/ibis/backends/dask/tests/conftest.py deleted file mode 100644 index d9cbfd2689f3b..0000000000000 --- a/ibis/backends/dask/tests/conftest.py +++ /dev/null @@ -1,396 +0,0 @@ -from __future__ import annotations - -import decimal -from typing import Any - -import dask -import pandas as pd -import pandas.testing as tm -import pytest - -import ibis -import ibis.expr.datatypes as dt -from ibis.backends.conftest import TEST_TABLES -from ibis.backends.pandas.tests.conftest import TestConf as PandasTest -from ibis.backends.tests.data import array_types, json_types, topk, win - -dd = pytest.importorskip("dask.dataframe") - - -# FIXME Dask issue with non deterministic groupby results, relates to the -# shuffle method on a local cluster. Manually setting the shuffle method -# avoids the issue https://github.com/dask/dask/issues/10034. -dask.config.set({"dataframe.shuffle.method": "tasks"}) - -# TODO: support pyarrow string column types across ibis -dask.config.set({"dataframe.convert-string": False}) - -# It's necessary that NPARTITIONS > 1 in order to test cross partitioning bugs. -NPARTITIONS = 2 - - -@pytest.fixture(scope="module") -def npartitions(): - return NPARTITIONS - - -class TestConf(PandasTest): - supports_structs = False - deps = ("dask.dataframe",) - - @staticmethod - def connect(*, tmpdir, worker_id, **kw): - return ibis.dask.connect(**kw) - - def _load_data(self, **_: Any) -> None: - import dask.dataframe as dd - - con = self.connection - for table_name in TEST_TABLES: - path = self.data_dir / "parquet" / f"{table_name}.parquet" - con.create_table( - table_name, - dd.from_pandas(pd.read_parquet(path), npartitions=NPARTITIONS), - ) - - con.create_table( - "array_types", - dd.from_pandas(array_types, npartitions=NPARTITIONS), - overwrite=True, - ) - con.create_table( - "win", dd.from_pandas(win, npartitions=NPARTITIONS), overwrite=True - ) - con.create_table( - "json_t", - dd.from_pandas(json_types, npartitions=NPARTITIONS), - overwrite=True, - ) - con.create_table( - "topk", - dd.from_pandas(topk.to_pandas(), npartitions=NPARTITIONS), - overwrite=True, - ) - - @classmethod - def assert_series_equal( - cls, left: pd.DataFrame, right: pd.DataFrame, *args: Any, **kwargs: Any - ) -> None: - kwargs.setdefault("check_dtype", cls.check_dtype) - kwargs.setdefault("check_names", cls.check_names) - left = left.reset_index(drop=True) - right = right.reset_index(drop=True) - tm.assert_series_equal(left, right, *args, **kwargs) - - -@pytest.fixture -def dataframe(npartitions): - dd = pytest.importorskip("dask.dataframe") - - return dd.from_pandas( - pd.DataFrame( - { - "plain_int64": list(range(1, 4)), - "plain_strings": list("abc"), - "dup_strings": list("dad"), - } - ), - npartitions=npartitions, - ) - - -@pytest.fixture -def con(dataframe): - return ibis.dask.connect({"df": dataframe}) - - -@pytest.fixture -def ibis_table(con): - return con.table("df") - - 
-@pytest.fixture(scope="module") -def pandas_df(): - return pd.DataFrame( - { - "plain_int64": list(range(1, 4)), - "plain_strings": list("abc"), - "plain_float64": [4.0, 5.0, 6.0], - "plain_datetimes_naive": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ), - "plain_datetimes_ny": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).dt.tz_localize("America/New_York"), - "plain_datetimes_utc": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).dt.tz_localize("UTC"), - "dup_strings": list("dad"), - "dup_ints": [1, 2, 1], - "float64_as_strings": ["100.01", "234.23", "-999.34"], - "int64_as_strings": list(map(str, range(1, 4))), - "strings_with_space": [" ", "abab", "ddeeffgg"], - "int64_with_zeros": [0, 1, 0], - "float64_with_zeros": [1.0, 0.0, 1.0], - "float64_positive": [1.0, 2.0, 1.0], - "strings_with_nulls": ["a", None, "b"], - "datetime_strings_naive": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).astype(str), - "datetime_strings_ny": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ) - .dt.tz_localize("America/New_York") - .astype(str), - "datetime_strings_utc": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ) - .dt.tz_localize("UTC") - .astype(str), - "decimal": list(map(decimal.Decimal, ["1.0", "2", "3.234"])), - "array_of_float64": [[1.0, 2.0], [3.0], []], - "array_of_int64": [[1, 2], [], [3]], - "array_of_strings": [["a", "b"], [], ["c"]], - "map_of_strings_integers": [{"a": 1, "b": 2}, None, {}], - "map_of_integers_strings": [{}, None, {1: "a", 2: "b"}], - "map_of_complex_values": [None, {"a": [1, 2, 3], "b": []}, {}], - } - ) - - -@pytest.fixture(scope="module") -def df(npartitions, pandas_df): - return dd.from_pandas(pandas_df, npartitions=npartitions) - - -@pytest.fixture(scope="module") -def batting_pandas_df(data_dir): - num_rows = 1000 - start_index = 30 - df = pd.read_parquet(data_dir / "parquet" / "batting.parquet").iloc[ - start_index : start_index + num_rows - ] - return df.reset_index(drop=True) - - -@pytest.fixture(scope="module") -def batting_df(npartitions, batting_pandas_df): - return dd.from_pandas(batting_pandas_df, npartitions=npartitions) - - -@pytest.fixture(scope="module") -def awards_players_df(data_dir): - return dd.read_parquet(data_dir / "parquet" / "awards_players.parquet") - - -@pytest.fixture(scope="module") -def df1(npartitions): - pandas_df = pd.DataFrame( - {"key": list("abcd"), "value": [3, 4, 5, 6], "key2": list("eeff")} - ) - return dd.from_pandas(pandas_df, npartitions=npartitions) - - -@pytest.fixture(scope="module") -def df2(npartitions): - pandas_df = pd.DataFrame( - {"key": list("ac"), "other_value": [4.0, 6.0], "key3": list("fe")} - ) - return dd.from_pandas(pandas_df, npartitions=npartitions) - - -@pytest.fixture(scope="module") -def intersect_df2(npartitions): - pandas_df = pd.DataFrame({"key": list("cd"), "value": [5, 6], "key2": list("ff")}) - return dd.from_pandas(pandas_df, npartitions=npartitions) - - -@pytest.fixture(scope="module") -def time_df1(npartitions): - pandas_df = pd.DataFrame( - {"time": pd.to_datetime([1, 2, 3, 4]), "value": [1.1, 2.2, 3.3, 4.4]} - ) - return dd.from_pandas(pandas_df, npartitions=npartitions) - - -@pytest.fixture(scope="module") -def time_df2(npartitions): - pandas_df = pd.DataFrame( - {"time": pd.to_datetime([2, 4]), "other_value": [1.2, 2.0]} - ) - return dd.from_pandas(pandas_df, npartitions=npartitions) - - 
-@pytest.fixture(scope="module") -def time_df3(npartitions): - pandas_df = pd.DataFrame( - { - "time": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=8).values - ), - "id": list(range(1, 9)), - "value": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8], - } - ) - return dd.from_pandas(pandas_df, npartitions=npartitions) - - -@pytest.fixture(scope="module") -def time_keyed_df1(npartitions): - pandas_df = pd.DataFrame( - { - "time": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=6).values - ), - "key": [1, 2, 3, 1, 2, 3], - "value": [1.2, 1.4, 2.0, 4.0, 8.0, 16.0], - } - ) - return dd.from_pandas(pandas_df, npartitions=npartitions) - - -@pytest.fixture(scope="module") -def time_keyed_df2(npartitions): - pandas_df = pd.DataFrame( - { - "time": pd.Series( - pd.date_range( - start="2017-01-02 01:02:03.234", freq="3D", periods=3 - ).values - ), - "key": [1, 2, 3], - "other_value": [1.1, 1.2, 2.2], - } - ) - return dd.from_pandas(pandas_df, npartitions=npartitions) - - -@pytest.fixture(scope="module") -def client( - df, - df1, - df2, - df3, - time_df1, - time_df2, - time_df3, - time_keyed_df1, - time_keyed_df2, - intersect_df2, -): - return ibis.dask.connect( - { - "df": df, - "df1": df1, - "df2": df2, - "df3": df3, - "left": df1, - "right": df2, - "time_df1": time_df1, - "time_df2": time_df2, - "time_df3": time_df3, - "time_keyed_df1": time_keyed_df1, - "time_keyed_df2": time_keyed_df2, - "intersect_df2": intersect_df2, - } - ) - - -@pytest.fixture(scope="module") -def df3(npartitions): - pandas_df = pd.DataFrame( - { - "key": list("ac"), - "other_value": [4.0, 6.0], - "key2": list("ae"), - "key3": list("fe"), - } - ) - return dd.from_pandas(pandas_df, npartitions=npartitions) - - -t_schema = { - "decimal": dt.Decimal(4, 3), - "array_of_float64": dt.Array(dt.double), - "array_of_int64": dt.Array(dt.int64), - "array_of_strings": dt.Array(dt.string), - "map_of_strings_integers": dt.Map(dt.string, dt.int64), - "map_of_integers_strings": dt.Map(dt.int64, dt.string), - "map_of_complex_values": dt.Map(dt.string, dt.Array(dt.int64)), -} - - -@pytest.fixture(scope="module") -def t(client): - return client.table("df", schema=t_schema) - - -@pytest.fixture(scope="module") -def lahman(batting_df, awards_players_df): - return ibis.dask.connect( - {"batting": batting_df, "awards_players": awards_players_df} - ) - - -@pytest.fixture(scope="module") -def left(client): - return client.table("left") - - -@pytest.fixture(scope="module") -def right(client): - return client.table("right") - - -@pytest.fixture(scope="module") -def time_left(client): - return client.table("time_df1") - - -@pytest.fixture(scope="module") -def time_right(client): - return client.table("time_df2") - - -@pytest.fixture(scope="module") -def time_table(client): - return client.table("time_df3") - - -@pytest.fixture(scope="module") -def time_keyed_left(client): - return client.table("time_keyed_df1") - - -@pytest.fixture(scope="module") -def time_keyed_right(client): - return client.table("time_keyed_df2") - - -@pytest.fixture(scope="module") -def batting(lahman): - return lahman.table("batting") - - -@pytest.fixture(scope="module") -def sel_cols(batting): - cols = batting.columns - start, end = cols.index("AB"), cols.index("H") + 1 - return ["playerID", "yearID", "teamID", "G"] + cols[start:end] - - -@pytest.fixture(scope="module") -def players_base(batting, sel_cols): - # TODO Dask doesn't support order_by and group_by yet - # Adding an order by would cause all groupby tests to fail. 
- return batting[sel_cols] # .order_by(sel_cols[:3]) - - -@pytest.fixture(scope="module") -def players(players_base): - return players_base.group_by("playerID") - - -@pytest.fixture(scope="module") -def players_df(players_base): - return players_base.execute().reset_index(drop=True) diff --git a/ibis/backends/dask/tests/test_arrays.py b/ibis/backends/dask/tests/test_arrays.py deleted file mode 100644 index 107cca5fedaa6..0000000000000 --- a/ibis/backends/dask/tests/test_arrays.py +++ /dev/null @@ -1,202 +0,0 @@ -from __future__ import annotations - -import operator - -import numpy as np -import pandas as pd -import pytest - -import ibis - -dd = pytest.importorskip("dask.dataframe") -from dask.dataframe.utils import tm # noqa: E402 - - -def test_array_length(t): - expr = t.select( - t.array_of_float64.length().name("array_of_float64_length"), - t.array_of_int64.length().name("array_of_int64_length"), - t.array_of_strings.length().name("array_of_strings_length"), - ) - result = expr.execute() - expected = pd.DataFrame( - { - "array_of_float64_length": [2, 1, 0], - "array_of_int64_length": [2, 0, 1], - "array_of_strings_length": [2, 0, 1], - } - ) - - tm.assert_frame_equal( - result.reset_index(drop=True), expected.reset_index(drop=True) - ) - - -def test_array_length_scalar(client): - raw_value = [1, 2, 4] - value = ibis.literal(raw_value) - expr = value.length() - result = client.execute(expr) - expected = len(raw_value) - assert result == expected - - -def test_array_collect(t, df): - expr = t.group_by(t.dup_strings).aggregate(collected=t.float64_with_zeros.collect()) - result = expr.compile() - expected = ( - df.groupby("dup_strings") - .float64_with_zeros.apply(list) - .reset_index() - .rename(columns={"float64_with_zeros": "collected"}) - ) - tm.assert_frame_equal( - result.compute().sort_values(["dup_strings"]).reset_index(drop=True), - expected.compute().sort_values(["dup_strings"]).reset_index(drop=True), - ) - - -def test_array_collect_rolling_partitioned(t, df): - window = ibis.trailing_window(1, order_by=t.plain_int64) - colexpr = t.plain_float64.collect().over(window) - expr = t["dup_strings", "plain_int64", colexpr.name("collected")] - result = expr.compile() - expected = dd.from_pandas( - pd.DataFrame( - { - "dup_strings": ["d", "a", "d"], - "plain_int64": [1, 2, 3], - "collected": [[4.0], [4.0, 5.0], [5.0, 6.0]], - } - ), - npartitions=1, - )[expr.columns] - tm.assert_frame_equal( - result.compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -# Need an ops.ArraySlice execution func that dispatches on dd.Series -@pytest.mark.notimpl(["dask"], reason="arrays - #2553") -@pytest.mark.parametrize( - ["start", "stop"], - [ - (1, 3), - (1, 1), - (2, 3), - (2, 5), - (None, 3), - (None, None), - (3, None), - (-3, None), - (None, -3), - (-3, -1), - ], -) -def test_array_slice(t, df, start, stop): - expr = t.array_of_strings[start:stop] - result = expr.compile() - slicer = operator.itemgetter(slice(start, stop)) - expected = df.array_of_strings.apply(slicer) - tm.assert_series_equal( - result.compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -@pytest.mark.parametrize( - ["start", "stop"], - [ - (1, 3), - (1, 1), - (2, 3), - (2, 5), - (None, 3), - (None, None), - (3, None), - (-3, None), - (None, -3), - (-3, -1), - ], -) -def test_array_slice_scalar(client, start, stop): - raw_value = [-11, 42, 10] - value = ibis.literal(raw_value) - expr = value[start:stop] - result = client.execute(expr) - expected = 
raw_value[start:stop] - assert np.array_equal(result, expected) - - -@pytest.mark.parametrize( - "index", - [1, 3, 4, 11, -11], -) -def test_array_index(t, df, index): - expr = t[t.array_of_float64[index].name("indexed")] - result = expr.execute() - expected = pd.DataFrame( - { - "indexed": df.array_of_float64.apply( - lambda x: x[index] if -len(x) <= index < len(x) else np.nan, - meta=("array_of_float64", "object"), - ) - } - ) - - tm.assert_frame_equal( - result.reset_index(drop=True), expected.reset_index(drop=True) - ) - - -@pytest.mark.parametrize("index", [1, 3, 4, 11]) -def test_array_index_scalar(client, index): - raw_value = [-10, 1, 2, 42] - value = ibis.literal(raw_value) - expr = value[index] - result = client.execute(expr) - expected = raw_value[index] if index < len(raw_value) else None - assert result == expected - - -@pytest.mark.parametrize("n", [1, 3, 4, 7, -2]) # negative returns empty list -@pytest.mark.parametrize("mul", [lambda x, n: x * n, lambda x, n: n * x]) -def test_array_repeat(t, df, n, mul): - expr = t.select(repeated=mul(t.array_of_strings, n)) - result = expr.execute() - expected = pd.DataFrame({"repeated": df.array_of_strings * n}) - tm.assert_frame_equal(result, expected) - - -# ValueError: Dask backend borrows Pandas backend's Cast execution -# function, which assumes array representation is np.array. -# NotImplementedError: Need an ops.ArrayConcat execution func that -# dispatches on dd.Series -@pytest.mark.notimpl(["dask"], reason="arrays - #2553") -@pytest.mark.parametrize("op", [lambda x, y: x + y, lambda x, y: y + x]) -def test_array_concat(t, df, op): - x = t.array_of_float64.cast("array") - y = t.array_of_strings - expr = op(x, y) - result = expr.compile() - expected = op( - df.array_of_float64.apply(lambda x: list(map(str, x))), - df.array_of_strings, - ) - tm.assert_series_equal( - result.compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -@pytest.mark.parametrize("op", [lambda x, y: x + y, lambda x, y: y + x]) -def test_array_concat_scalar(client, op): - raw_left = [1, 2, 3] - raw_right = [3, 4] - left = ibis.literal(raw_left) - right = ibis.literal(raw_right) - expr = op(left, right) - result = client.execute(expr) - expected = op(raw_left, raw_right) - assert np.array_equal(result, expected) diff --git a/ibis/backends/dask/tests/test_cast.py b/ibis/backends/dask/tests/test_cast.py deleted file mode 100644 index 23187b59abf96..0000000000000 --- a/ibis/backends/dask/tests/test_cast.py +++ /dev/null @@ -1,178 +0,0 @@ -from __future__ import annotations - -import decimal - -import pandas as pd -import pytest -import pytz -from pytest import param - -import ibis -import ibis.expr.datatypes as dt -from ibis.backends.conftest import is_older_than - -pytest.importorskip("dask.dataframe") -from dask.dataframe.utils import tm # noqa: E402 - -TIMESTAMP = "2022-03-13 06:59:10.467417" - - -@pytest.mark.parametrize("from_", ["plain_float64", "plain_int64"]) -@pytest.mark.parametrize( - ("to", "expected"), - [ - ("float16", "float16"), - ("float32", "float32"), - ("float64", "float64"), - ("double", "float64"), - ("float", "float64"), - ("int8", "int8"), - ("int16", "int16"), - ("int32", "int32"), - ("int64", "int64"), - ("string", "object"), - ], -) -def test_cast_numeric(t, df, from_, to, expected): - c = t[from_].cast(to) - result = c.execute() - assert str(result.dtype) == expected - - -@pytest.mark.parametrize("from_", ["float64_as_strings", "int64_as_strings"]) -@pytest.mark.parametrize( - ("to", "expected"), 
[("double", "float64"), ("string", "object")] -) -def test_cast_string(t, df, from_, to, expected): - c = t[from_].cast(to) - result = c.execute() - assert str(result.dtype) == expected - - -@pytest.mark.parametrize( - ("to", "expected"), - [ - pytest.param( - "string", - "object", - marks=pytest.mark.skipif( - is_older_than("pandas", "2.1.0"), reason="raises a NotImplementedError" - ), - ), - ("int64", "int64"), - ( - dt.Timestamp("America/Los_Angeles"), - "datetime64[ns, America/Los_Angeles]", - ), - ( - "timestamp('America/Los_Angeles')", - "datetime64[ns, America/Los_Angeles]", - ), - ], -) -@pytest.mark.parametrize( - "column", - ["plain_datetimes_naive", "plain_datetimes_ny", "plain_datetimes_utc"], -) -def test_cast_timestamp_column(t, df, column, to, expected): - c = t[column].cast(to) - result = c.execute() - assert str(result.dtype) == expected - - -@pytest.mark.parametrize( - ("to", "expected"), - [ - pytest.param( - "string", - str, - marks=pytest.mark.skipif( - is_older_than("pandas", "2.1.0"), reason="raises a NotImplementedError" - ), - ), - ("int64", lambda x: pd.Timestamp(x).value // int(1e9)), - ("double", lambda x: float(pd.Timestamp(x).value // int(1e9))), - ( - dt.Timestamp("America/Los_Angeles"), - lambda x: x.tz_localize(tz="America/Los_Angeles"), - ), - ], -) -def test_cast_timestamp_scalar_naive(con, to, expected): - literal_expr = ibis.literal(pd.Timestamp(TIMESTAMP)) - value = literal_expr.cast(to) - result = con.execute(value) - raw = con.execute(literal_expr) - assert result == expected(raw) - - -@pytest.mark.parametrize( - ("to", "expected"), - [ - pytest.param( - "string", - str, - marks=pytest.mark.skipif( - is_older_than("pandas", "2.1.0"), reason="raises a NotImplementedError" - ), - ), - ("int64", lambda x: pd.Timestamp(x).value // int(1e9)), - param("double", float, marks=pytest.mark.notimpl(["dask"])), - ( - dt.Timestamp("America/Los_Angeles"), - lambda x: x.astimezone(tz=pytz.timezone("America/Los_Angeles")), - ), - ], -) -@pytest.mark.parametrize("tz", ["UTC", "America/New_York"]) -def test_cast_timestamp_scalar(to, expected, tz, con): - literal_expr = ibis.literal(pd.Timestamp(TIMESTAMP).tz_localize(tz)) - value = literal_expr.cast(to) - result = con.execute(value) - raw = con.execute(literal_expr) - assert result == expected(raw) - - -def test_timestamp_with_timezone_is_inferred_correctly(t): - assert t.plain_datetimes_naive.type().equals(dt.timestamp) - assert t.plain_datetimes_ny.type().equals(dt.Timestamp("America/New_York")) - assert t.plain_datetimes_utc.type().equals(dt.Timestamp("UTC")) - - -@pytest.mark.parametrize( - "column", - ["plain_datetimes_naive", "plain_datetimes_ny", "plain_datetimes_utc"], -) -def test_cast_date(t, df, column): - expr = t[column].cast("date") - result = expr.execute() - expected = ( - df[column] - .dt.normalize() - .map(lambda x: x.date()) - .compute() - .rename(expr.get_name()) - ) - tm.assert_series_equal(result, expected, check_index=False) - - -@pytest.mark.parametrize("type", [dt.Decimal(9, 2), dt.Decimal(12, 3)]) -def test_cast_to_decimal(t, pandas_df, type): - expr = t.float64_as_strings.cast(type) - result = expr.execute() - context = decimal.Context(prec=type.precision) - expected = pandas_df.float64_as_strings.apply( - lambda x: context.create_decimal(x).quantize( - decimal.Decimal( - "{}.{}".format("0" * (type.precision - type.scale), "0" * type.scale) - ) - ) - ) - tm.assert_series_equal(result, expected, check_names=False) - assert all( - abs(element.as_tuple().exponent) == type.scale for element in 
result.values - ) - assert all( - 1 <= len(element.as_tuple().digits) <= type.precision - for element in result.values - ) diff --git a/ibis/backends/dask/tests/test_client.py b/ibis/backends/dask/tests/test_client.py deleted file mode 100644 index da9dbf1a82bb0..0000000000000 --- a/ibis/backends/dask/tests/test_client.py +++ /dev/null @@ -1,128 +0,0 @@ -from __future__ import annotations - -import re - -import dask.dataframe as dd -import numpy as np -import pandas as pd -import pytest -from dask.dataframe.utils import tm -from pytest import param - -import ibis -import ibis.expr.operations as ops - - -def make_dask_data_frame(npartitions): - df = pd.DataFrame(np.random.randn(30, 4), columns=list("ABCD")) - return dd.from_pandas(df, npartitions=npartitions) - - -@pytest.fixture -def client(npartitions): - return ibis.dask.connect( - { - "df": dd.from_pandas( - pd.DataFrame({"a": [1, 2, 3], "b": list("abc")}), - npartitions=npartitions, - ), - "df_unknown": dd.from_pandas( - pd.DataFrame({"array_of_strings": [["a", "b"], [], ["c"]]}), - npartitions=npartitions, - ), - } - ) - - -@pytest.fixture -def table(client): - return client.table("df") - - -def test_connect_no_args(): - con = ibis.dask.connect() - assert dict(con.tables) == {} - - -def test_client_table(table): - assert isinstance(table.op(), ops.DatabaseTable) - - -def test_create_table(client, npartitions): - ddf = make_dask_data_frame(npartitions) - client.create_table("testing", obj=ddf) - assert "testing" in client.list_tables() - client.create_table("testingschema", schema=client.get_schema("testing")) - assert "testingschema" in client.list_tables() - - -def test_literal(client): - lit = ibis.literal(1) - result = client.execute(lit) - assert result == 1 - - -def test_list_tables(client): - assert client.list_tables(like="df_unknown") - assert not client.list_tables(like="not_in_the_database") - assert client.list_tables() - - -def test_drop(table): - table = table.mutate(c=table.a) - expr = table.drop("a") - result = expr.execute() - expected = table[["b", "c"]].execute() - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "unit", - [ - "Y", - "M", - "D", - "h", - "m", - "s", - "ms", - "us", - "ns", - param("ps", marks=pytest.mark.xfail), - param("fs", marks=pytest.mark.xfail), - param("as", marks=pytest.mark.xfail), - ], -) -def test_datetime64_infer(client, unit): - value = np.datetime64("2018-01-02", unit) - expr = ibis.literal(value, type="timestamp") - result = client.execute(expr) - assert result == pd.Timestamp(value).to_pydatetime() - - -def test_invalid_connection_parameter_types(npartitions): - # Check that the user receives a TypeError with an informative message when - # passing invalid an connection parameter to the backend. - expected_msg = re.escape( - "Expected an instance of 'dask.dataframe.DataFrame' for 'invalid_str'," - " got an instance of 'str' instead." - ) - with pytest.raises(TypeError, match=expected_msg): - ibis.dask.connect( - { - "valid_dask_df": dd.from_pandas( - pd.DataFrame({"a": [1, 2, 3], "b": list("abc")}), - npartitions=npartitions, - ), - "valid_pandas_df": pd.DataFrame({"a": [1, 2, 3], "b": list("abc")}), - "invalid_str": "file.csv", - } - ) - - expected_msg = re.escape( - "Expected an instance of 'dask.dataframe.DataFrame' for 'df', " - "got an instance of 'str' instead." 
- ) - con = ibis.dask.connect() - with pytest.raises(TypeError, match=expected_msg): - con.from_dataframe("file.csv") diff --git a/ibis/backends/dask/tests/test_core.py b/ibis/backends/dask/tests/test_core.py deleted file mode 100644 index 6f480ff0d9211..0000000000000 --- a/ibis/backends/dask/tests/test_core.py +++ /dev/null @@ -1,39 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pytest -from dask.dataframe.utils import tm - -import ibis - -dd = pytest.importorskip("dask.dataframe") - - -def test_table_from_dataframe(dataframe, ibis_table, con): - t = con.from_dataframe(dataframe) - result = t.execute() - expected = ibis_table.execute() - tm.assert_frame_equal(result, expected) - - t = con.from_dataframe(dataframe, name="foo") - expected = ibis_table.execute() - tm.assert_frame_equal(result, expected) - - t = con.from_dataframe(dataframe, name="foo", client=con) - expected = ibis_table.execute() - tm.assert_frame_equal(result, expected) - - -def test_array_literal_from_series(con): - values = [1, 2, 3, 4] - s = dd.from_pandas(pd.Series(values), npartitions=1) - expr = ibis.array(s) - - assert expr.equals(ibis.array(values)) - assert con.execute(expr) == pytest.approx([1, 2, 3, 4]) - - -def test_execute_parameter_only(con): - param = ibis.param("int64") - result = con.execute(param, params={param.op(): 42}) - assert result == 42 diff --git a/ibis/backends/dask/tests/test_functions.py b/ibis/backends/dask/tests/test_functions.py deleted file mode 100644 index 4f65abb48cb3c..0000000000000 --- a/ibis/backends/dask/tests/test_functions.py +++ /dev/null @@ -1,197 +0,0 @@ -from __future__ import annotations - -import decimal -import functools -import math -import operator -from operator import methodcaller - -import numpy as np -import pandas as pd -import pytest -from pytest import param - -import ibis -import ibis.expr.datatypes as dt -from ibis.backends.dask.tests.conftest import TestConf as tm - - -@pytest.mark.parametrize( - "op", - [ - # comparison - operator.eq, - operator.ne, - operator.lt, - operator.le, - operator.gt, - operator.ge, - ], -) -def test_binary_operations(t, df, op): - expr = op(t.plain_float64, t.plain_int64) - result = expr.execute() - expected = op(df.plain_float64, df.plain_int64).compute() - tm.assert_series_equal( - result.reset_index(drop=True).rename("tmp"), - expected.reset_index(drop=True).rename("tmp"), - ) - - -@pytest.mark.parametrize("op", [operator.and_, operator.or_, operator.xor]) -def test_binary_boolean_operations(t, pandas_df, op): - expr = op(t.plain_int64 == 1, t.plain_int64 == 2) - result = expr.execute() - expected = op(pandas_df.plain_int64 == 1, pandas_df.plain_int64 == 2) - tm.assert_series_equal( - result.reset_index(drop=True), - expected.reset_index(drop=True), - ) - - -def operate(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - return wrapper - - -@pytest.mark.parametrize( - ("ibis_func", "pandas_func"), - [ - param( - methodcaller("round", 2), - lambda x: x.quantize(decimal.Decimal(".00")), - id="round_2", - ), - param( - methodcaller("round", 0), - lambda x: x.quantize(decimal.Decimal("0.")), - id="round_0", - ), - param(methodcaller("ceil"), lambda x: decimal.Decimal(math.ceil(x)), id="ceil"), - param( - methodcaller("floor"), lambda x: decimal.Decimal(math.floor(x)), id="floor" - ), - param( - methodcaller("exp"), - methodcaller("exp"), - id="exp", - marks=pytest.mark.xfail( - 
reason="Unable to normalize Decimal('2.71513316E+43') as decimal with precision 12 and scale 3", - raises=TypeError, - ), - ), - param( - methodcaller("sign"), - lambda x: x if not x else decimal.Decimal(1).copy_sign(x), - id="sign", - ), - param(methodcaller("sqrt"), operate(lambda x: x.sqrt()), id="sqrt"), - param( - methodcaller("log", 2), - operate(lambda x: x.ln() / decimal.Decimal(2).ln()), - id="log_2", - ), - param(methodcaller("ln"), operate(lambda x: x.ln()), id="ln"), - param( - methodcaller("log2"), - operate(lambda x: x.ln() / decimal.Decimal(2).ln()), - id="log2", - ), - param(methodcaller("log10"), operate(lambda x: x.log10()), id="log10"), - ], -) -def test_math_functions_decimal(t, pandas_df, ibis_func, pandas_func): - dtype = dt.Decimal(12, 3) - context = decimal.Context(prec=dtype.precision) - p = decimal.Decimal(f"{'0' * (dtype.precision - dtype.scale)}.{'0' * dtype.scale}") - - def func(x): - x = context.create_decimal(x) - x = pandas_func(x) - if math.isnan(x): - return float("nan") - return x.quantize(p) - - expr = ibis_func(t.float64_as_strings.cast(dtype)) - result = expr.execute() - expected = pandas_df.float64_as_strings.map(func, na_action="ignore") - tm.assert_series_equal(result, expected, check_names=False) - - -def test_round_decimal_with_negative_places(t): - type = dt.Decimal(12, 3) - expr = t.float64_as_strings.cast(type).round(-1) - result = expr.execute() - expected = pd.Series( - list(map(decimal.Decimal, ["1.0E+2", "2.3E+2", "-1.00E+3"])), - name="float64_as_strings", - ) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - ("ibis_func", "dask_func"), - [ - ( - lambda x: x.quantile([0.25, 0.75]), - lambda x: list(x.quantile([0.25, 0.75])), - ) - ], -) -@pytest.mark.parametrize("column", ["float64_with_zeros", "int64_with_zeros"]) -def test_quantile_list(t, pandas_df, ibis_func, dask_func, column): - expr = ibis_func(t[column]) - result = expr.execute() - expected = dask_func(pandas_df[column]) - assert result == expected - - -@pytest.mark.parametrize( - ("ibis_func", "dask_func"), - [ - (lambda x: x.quantile(0), lambda x: x.quantile(0)), - (lambda x: x.quantile(1), lambda x: x.quantile(1)), - ( - lambda x: x.quantile(0.5), - lambda x: x.quantile(0.5), - ), - ], -) -def test_quantile_scalar(t, pandas_df, ibis_func, dask_func): - result = ibis_func(t.float64_with_zeros).execute() - expected = dask_func(pandas_df.float64_with_zeros) - assert result == expected - - result = ibis_func(t.int64_with_zeros).execute() - expected = dask_func(pandas_df.int64_with_zeros) - assert result == expected - - -@pytest.mark.parametrize( - ("ibis_func", "exc"), - [ - # no lower/upper specified - (lambda x: x.clip(), ValueError), - # out of range on quantile - (lambda x: x.quantile(5.0), ValueError), - ], -) -def test_arraylike_functions_transform_errors(t, df, ibis_func, exc): - with pytest.raises(exc): - ibis_func(t.float64_with_zeros).execute() - - -def test_ifelse_returning_bool(con): - one = ibis.literal(1) - two = ibis.literal(2) - true = ibis.literal(True) - false = ibis.literal(False) - expr = ibis.ifelse(one + one == two, true, false) - result = con.execute(expr) - assert result is np.bool_(True) diff --git a/ibis/backends/dask/tests/test_join.py b/ibis/backends/dask/tests/test_join.py deleted file mode 100644 index 75b1235d5182a..0000000000000 --- a/ibis/backends/dask/tests/test_join.py +++ /dev/null @@ -1,368 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pytest -from pandas import date_range - -import ibis - 
-dd = pytest.importorskip("dask.dataframe")
-from dask.dataframe.utils import tm  # noqa: E402
-
-# Note - computations in this file use the single-threaded scheduler (instead
-# of the default multithreaded scheduler) in order to avoid a flaky interaction
-# between dask and pandas in merges. There is evidence this has been fixed in
-# pandas>=1.1.2 (or in other schedulers). For more background see:
-# - https://github.com/dask/dask/issues/6454
-# - https://github.com/dask/dask/issues/5060
-
-
-join_type = pytest.mark.parametrize(
-    "how",
-    [
-        "inner",
-        "left",
-        "right",
-        "outer",
-    ],
-)
-
-
-@join_type
-def test_join(how, left, right, df1, df2):
-    expr = left.join(right, left.key == right.key, how=how)[
-        left, right.other_value, right.key3
-    ]
-    result = expr.compile()
-    expected = dd.merge(df1, df2, how=how, on="key")
-    tm.assert_frame_equal(
-        result[expected.columns].compute(scheduler="single-threaded"),
-        expected.compute(scheduler="single-threaded").reset_index(drop=True),
-    )
-
-
-@join_type
-def test_join_project_left_table(how, left, right, df1, df2):
-    expr = left.join(right, left.key == right.key, how=how)[left, right.key3]
-    result = expr.compile()
-    expected = dd.merge(df1, df2, how=how, on="key")[list(left.columns) + ["key3"]]
-    tm.assert_frame_equal(
-        result[expected.columns].compute(scheduler="single-threaded"),
-        expected.compute(scheduler="single-threaded").reset_index(drop=True),
-    )
-
-
-@join_type
-def test_join_with_invalid_predicates(how, left, right):
-    predicate = (left.key == right.key) & (left.key2 <= right.key3)
-    expr = left.join(right, predicate, how=how)
-    with pytest.raises(TypeError):
-        expr.compile()
-
-    predicate = left.key >= right.key
-    expr = left.join(right, predicate, how=how)
-    with pytest.raises(TypeError):
-        expr.compile()
-
-
-@join_type
-@pytest.mark.xfail(reason="Hard to detect this case")
-def test_join_with_duplicate_non_key_columns(how, left, right, df1, df2):
-    left = left.mutate(x=left.value * 2)
-    right = right.mutate(x=right.other_value * 3)
-    expr = left.join(right, left.key == right.key, how=how)
-
-    # This is undefined behavior because `x` is duplicated.
This is difficult - # to detect - with pytest.raises(ValueError): - expr.compile() - - -@join_type -def test_join_with_post_expression_selection(how, left, right, df1, df2): - join = left.join(right, left.key == right.key, how=how) - expr = join[left.key, left.value, right.other_value] - result = expr.compile() - expected = dd.merge(df1, df2, on="key", how=how)[["key", "value", "other_value"]] - tm.assert_frame_equal( - result[expected.columns].compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded").reset_index(drop=True), - ) - - -@join_type -def test_join_with_post_expression_filter(how, left): - lhs = left[["key", "key2"]] - rhs = left[["key2", "value"]] - - joined = lhs.join(rhs, "key2", how=how) - projected = joined[lhs, rhs.value] - expr = projected[projected.value == 4] - result = expr.compile() - - df1 = lhs.compile() - df2 = rhs.compile() - expected = dd.merge(df1, df2, on="key2", how=how) - expected = expected.loc[expected.value == 4].reset_index(drop=True) - - tm.assert_frame_equal( - result.compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded"), - ) - - -@join_type -def test_multi_join_with_post_expression_filter(how, left, df1): - lhs = left[["key", "key2"]] - rhs = left[["key2", "value"]] - rhs2 = left[["key2", "value"]].rename(value2="value") - - joined = lhs.join(rhs, "key2", how=how) - projected = joined[lhs, rhs.value] - filtered = projected[projected.value == 4] - - joined2 = filtered.join(rhs2, "key2") - projected2 = joined2[filtered.key, rhs2.value2] - expr = projected2[projected2.value2 == 3] - - result = expr.compile() - - df1 = lhs.compile() - df2 = rhs.compile() - df3 = rhs2.compile() - expected = dd.merge(df1, df2, on="key2", how=how) - expected = expected.loc[expected.value == 4].reset_index(drop=True) - expected = dd.merge(expected, df3, on="key2")[["key", "value2"]] - expected = expected.loc[expected.value2 == 3].reset_index(drop=True) - - tm.assert_frame_equal( - result.compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded"), - ) - - -@join_type -def test_join_with_non_trivial_key(how, left, right, df1, df2): - # also test that the order of operands in the predicate doesn't matter - join = left.join(right, right.key.length() == left.key.length(), how=how) - expr = join[left.key, left.value, right.other_value] - result = expr.compile() - - expected = ( - dd.merge( - df1.assign(key_len=df1.key.str.len()), - df2.assign(key_len=df2.key.str.len()), - on="key_len", - how=how, - ) - .drop(["key_len", "key_y", "key2", "key3"], axis=1) - .rename(columns={"key_x": "key"}) - ) - tm.assert_frame_equal( - result[expected.columns].compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded"), - ) - - -@join_type -def test_join_with_non_trivial_key_project_table(how, left, right, df1, df2): - # also test that the order of operands in the predicate doesn't matter - join = left.join(right, right.key.length() == left.key.length(), how=how) - expr = join[left, right.other_value] - expr = expr[expr.key.length() == 1] - result = expr.compile() - - expected = ( - dd.merge( - df1.assign(key_len=df1.key.str.len()), - df2.assign(key_len=df2.key.str.len()), - on="key_len", - how=how, - ) - .drop(["key_len", "key_y", "key2", "key3"], axis=1) - .rename(columns={"key_x": "key"}) - ) - expected = expected.loc[expected.key.str.len() == 1] - tm.assert_frame_equal( - result[expected.columns].compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded"), - 
) - - -@join_type -def test_join_with_project_right_duplicate_column(client, how, left, df1, df3): - # also test that the order of operands in the predicate doesn't matter - right = client.table("df3") - join = left.join(right, ["key"], how=how) - expr = join[left.key, right.key2, right.other_value] - result = expr.compile() - - expected = ( - dd.merge(df1, df3, on="key", how=how) - .drop(["key2_x", "key3", "value"], axis=1) - .rename(columns={"key2_y": "key2"}) - ) - tm.assert_frame_equal( - result[expected.columns].compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded").reset_index(drop=True), - ) - - -merge_asof_minversion = pytest.mark.skipif( - pd.__version__ < "0.19.2", - reason="at least pandas-0.19.2 required for merge_asof", -) - - -@merge_asof_minversion -def test_asof_join(time_left, time_right, time_df1, time_df2): - expr = time_left.asof_join(time_right, "time")[time_left, time_right.other_value] - result = expr.compile() - expected = dd.merge_asof(time_df1, time_df2, on="time") - tm.assert_frame_equal( - result[expected.columns].compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded").reset_index(drop=True), - ) - - -@merge_asof_minversion -def test_keyed_asof_join( - time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 -): - expr = time_keyed_left.asof_join(time_keyed_right, "time", predicates="key")[ - time_keyed_left, time_keyed_right.other_value - ] - result = expr.compile() - expected = dd.merge_asof(time_keyed_df1, time_keyed_df2, on="time", by="key") - tm.assert_frame_equal( - result[expected.columns].compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded").reset_index(drop=True), - ) - - -@merge_asof_minversion -def test_asof_join_overlapping_non_predicate( - time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 -): - # Add a junk column with a colliding name - time_keyed_left = time_keyed_left.mutate( - collide=time_keyed_left.key + time_keyed_left.value - ) - time_keyed_right = time_keyed_right.mutate( - collide=time_keyed_right.key + time_keyed_right.other_value - ) - time_keyed_df1.assign(collide=time_keyed_df1["key"] + time_keyed_df1["value"]) - time_keyed_df2.assign(collide=time_keyed_df2["key"] + time_keyed_df2["other_value"]) - - expr = time_keyed_left.asof_join( - time_keyed_right, on="time", predicates=[("key", "key")] - ) - result = expr.compile() - expected = dd.merge_asof( - time_keyed_df1, time_keyed_df2, on="time", by="key", suffixes=("", "_right") - ) - tm.assert_frame_equal( - result[expected.columns].compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded").reset_index(drop=True), - ) - - -@pytest.mark.parametrize( - "how", - [ - "left", - "right", - "inner", - "outer", - ], -) -@pytest.mark.parametrize( - "func", - [ - pytest.param(lambda join: join["a0", "a1"], id="tuple"), - pytest.param(lambda join: join[["a0", "a1"]], id="list"), - pytest.param(lambda join: join.select(["a0", "a1"]), id="select"), - ], -) -def test_select_on_unambiguous_join(con, how, func): - df_t = pd.DataFrame({"a0": [1, 2, 3], "b1": list("aab")}) - df_s = pd.DataFrame({"a1": [2, 3, 4], "b2": list("abc")}) - - t = ibis.memtable(df_t) - s = ibis.memtable(df_s) - method = getattr(t, f"{how}_join") - join = method(s, t.b1 == s.b2) - expr = func(join) - result = con.compile(expr).compute(scheduler="single-threaded") - - expected = pd.merge(df_t, df_s, left_on=["b1"], right_on=["b2"], how=how)[ - ["a0", "a1"] - ] - assert not expected.empty - 
- tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "func", - [ - pytest.param(lambda join: join["a0", "a1"], id="tuple"), - pytest.param(lambda join: join[["a0", "a1"]], id="list"), - pytest.param(lambda join: join.select(["a0", "a1"]), id="select"), - ], -) -@merge_asof_minversion -def test_select_on_unambiguous_asof_join(func, npartitions): - df_t = dd.from_pandas( - pd.DataFrame({"a0": [1, 2, 3], "b1": date_range("20180101", periods=3)}), - npartitions=npartitions, - ) - df_s = dd.from_pandas( - pd.DataFrame({"a1": [2, 3, 4], "b2": date_range("20171230", periods=3)}), - npartitions=npartitions, - ) - con = ibis.dask.connect({"t": df_t, "s": df_s}) - t = con.table("t") - s = con.table("s") - join = t.asof_join(s, t.b1 == s.b2) - expected = dd.merge_asof(df_t, df_s, left_on=["b1"], right_on=["b2"])[["a0", "a1"]] - assert not expected.compute(scheduler="single-threaded").empty - expr = func(join) - result = expr.compile() - tm.assert_frame_equal( - result.compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded").reset_index(drop=True), - ) - - -def test_outer_join(npartitions): - df = dd.from_pandas( - pd.DataFrame({"test": [1, 2, 3], "name": ["a", "b", "c"]}), - npartitions=npartitions, - ) - df_2 = dd.from_pandas( - pd.DataFrame({"test_2": [1, 5, 6], "name_2": ["d", "e", "f"]}), - npartitions=npartitions, - ) - - conn = ibis.dask.connect({"df": df, "df_2": df_2}) - - ibis_table_1 = conn.table("df") - ibis_table_2 = conn.table("df_2") - - joined = ibis_table_1.outer_join( - ibis_table_2, - predicates=ibis_table_1["test"] == ibis_table_2["test_2"], - ) - result = joined.compile() - expected = dd.merge( - df, - df_2, - left_on="test", - right_on="test_2", - how="outer", - ) - tm.assert_frame_equal( - result.compute(scheduler="single-threaded"), - expected.compute(scheduler="single-threaded").reset_index(drop=True), - ) diff --git a/ibis/backends/dask/tests/test_maps.py b/ibis/backends/dask/tests/test_maps.py deleted file mode 100644 index b7445434211dd..0000000000000 --- a/ibis/backends/dask/tests/test_maps.py +++ /dev/null @@ -1,90 +0,0 @@ -from __future__ import annotations - -import numpy as np -import pandas as pd -import pytest - -import ibis - -dd = pytest.importorskip("dask.dataframe") -from dask.dataframe.utils import tm # noqa: E402 - - -def test_map_length_expr(t): - expr = t.map_of_integers_strings.length() - result = expr.execute() - expected = pd.Series([0, None, 2], name="MapLength(map_of_integers_strings)") - tm.assert_series_equal(result, expected, check_index=False) - - -def test_map_value_for_key_expr(t): - expr = t.map_of_integers_strings[1] - result = expr.execute() - expected = pd.Series( - [None, None, "a"], name="MapGet(map_of_integers_strings, 1, None)" - ) - tm.assert_series_equal(result, expected, check_index=False) - - -def test_map_value_or_default_for_key_expr(t): - expr = t.map_of_complex_values.get("a") - result = expr.execute() - expected = pd.Series( - [None, [1, 2, 3], None], - dtype="object", - name=expr.get_name(), - ) - tm.assert_series_equal(result, expected, check_index=False) - - -def safe_sorter(element): - return sorted(element) if isinstance(element, list) else element - - -def test_map_keys_expr(t): - expr = t.map_of_strings_integers.keys() - result = expr.execute().map(safe_sorter) - expected = pd.Series( - [["a", "b"], None, []], - dtype="object", - name="MapKeys(map_of_strings_integers)", - ).map(safe_sorter) - tm.assert_series_equal(result, expected, check_index=False) - - -def 
test_map_values_expr(t): - expr = t.map_of_complex_values.values() - result = expr.execute() - expected = pd.Series( - [ - None, - np.array([[1, 2, 3], []], dtype="object"), - np.array([], dtype="object"), - ], - dtype="object", - name="MapValues(map_of_complex_values)", - ) - tm.assert_series_equal(result, expected, check_index=False) - - -def test_map_concat_expr(t): - expr = t.map_of_complex_values + {"b": [4, 5, 6], "c": [], "a": []} - result = expr.execute() - expected = pd.Series( - [ - None, - {"a": [], "b": [4, 5, 6], "c": []}, - {"b": [4, 5, 6], "c": [], "a": []}, - ], - dtype="object", - name=expr.get_name(), - ) - tm.assert_series_equal(result, expected, check_index=False) - - -def test_map_value_for_key_literal_broadcast(t): - lookup_table = ibis.literal({"a": 1, "b": 2, "c": 3, "d": 4}) - expr = lookup_table.get(t.dup_strings) - result = expr.execute() - expected = pd.Series([4, 1, 4], dtype="int8", name=expr.get_name()) - tm.assert_series_equal(result, expected, check_index=False) diff --git a/ibis/backends/dask/tests/test_operations.py b/ibis/backends/dask/tests/test_operations.py deleted file mode 100644 index cf6bd9a9eb040..0000000000000 --- a/ibis/backends/dask/tests/test_operations.py +++ /dev/null @@ -1,847 +0,0 @@ -from __future__ import annotations - -import operator -from operator import methodcaller - -import numpy as np -import numpy.testing as npt -import pandas as pd -import pytest -from packaging.version import parse as vparse -from pytest import param - -import ibis -import ibis.expr.datatypes as dt - -dask = pytest.importorskip("dask") -da = pytest.importorskip("dask.array") -dd = pytest.importorskip("dask.dataframe") - -from dask.dataframe.utils import tm # noqa: E402 - - -def test_table_column(t, pandas_df): - expr = t.plain_int64 - result = expr.execute() - expected = pandas_df.plain_int64 - tm.assert_series_equal(result, expected) - - -def test_literal(client): - assert client.execute(ibis.literal(1)) == 1 - - -def test_selection(t, df): - expr = t[((t.plain_strings == "a") | (t.plain_int64 == 3)) & (t.dup_strings == "d")] - result = expr.compile() - expected = df[ - ((df.plain_strings == "a") | (df.plain_int64 == 3)) & (df.dup_strings == "d") - ] - tm.assert_frame_equal( - result[expected.columns].compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -def test_mutate(t, df): - expr = t.mutate(x=t.plain_int64 + 1, y=t.plain_int64 * 2) - result = expr.compile() - expected = df.assign(x=df.plain_int64 + 1, y=df.plain_int64 * 2) - tm.assert_frame_equal( - result[expected.columns].compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -@pytest.mark.xfail(reason="TODO - windowing - #2553") -def test_project_scope_does_not_override(t, df): - col = t.plain_int64 - expr = t[ - [ - col.name("new_col"), - col.sum().over(ibis.window(group_by="dup_strings")).name("grouped"), - ] - ] - result = expr.compile() - expected = dd.concat( - [ - df[["plain_int64", "dup_strings"]].rename( - columns={"plain_int64": "new_col"} - ), - df.groupby("dup_strings") - .plain_int64.transform("sum") - .reset_index(drop=True) - .rename("grouped"), - ], - axis=1, - )[["new_col", "grouped"]] - tm.assert_frame_equal( - result.compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -@pytest.mark.parametrize( - "where", - [ - param(lambda _: None, id="none"), - param(lambda t: t.dup_strings == "d", id="simple"), - param(lambda t: (t.dup_strings == "d") | (t.plain_int64 < 100), id="complex"), - ], -) 
-@pytest.mark.parametrize( - ("ibis_func", "pandas_func"), - [ - param(methodcaller("abs"), np.abs, id="abs"), - param(methodcaller("ceil"), np.ceil, id="ceil"), - param(methodcaller("exp"), np.exp, id="exp"), - param(methodcaller("floor"), np.floor, id="floor"), - param(methodcaller("ln"), np.log, id="log"), - param(methodcaller("log10"), np.log10, id="log10"), - param(methodcaller("log", 2), lambda x: np.log(x) / np.log(2), id="logb"), - param(methodcaller("log2"), np.log2, id="log2"), - param( - methodcaller("round", 0), lambda x: x.round(0).astype("int64"), id="round0" - ), - param(methodcaller("round", -2), methodcaller("round", -2), id="roundm2"), - param(methodcaller("round", 2), methodcaller("round", 2), id="round2"), - param(methodcaller("round"), lambda x: x.round().astype("int64"), id="round"), - param(methodcaller("sign"), np.sign, id="sign"), - param(methodcaller("sqrt"), np.sqrt, id="sqrt"), - ], -) -def test_aggregation_group_by(t, pandas_df, where, ibis_func, pandas_func): - ibis_where = where(t) - expr = t.group_by(t.dup_strings).aggregate( - avg_plain_int64=t.plain_int64.mean(where=ibis_where), - sum_plain_float64=t.plain_float64.sum(where=ibis_where), - mean_float64_positive=ibis_func(t.float64_positive).mean(where=ibis_where), - neg_mean_int64_with_zeros=(-t.int64_with_zeros).mean(where=ibis_where), - nunique_dup_ints=t.dup_ints.nunique(), - ) - result = expr.execute() - - df = pandas_df - pandas_where = where(df) - mask = slice(None) if pandas_where is None else pandas_where - expected = ( - df.groupby("dup_strings") - .agg( - { - "plain_int64": lambda x, mask=mask: x[mask].mean(), - "plain_float64": lambda x, mask=mask: x[mask].sum(), - "dup_ints": "nunique", - "float64_positive": ( - lambda x, mask=mask, func=pandas_func: func(x[mask]).mean() - ), - "int64_with_zeros": lambda x, mask=mask: (-x[mask]).mean(), - } - ) - .reset_index() - .rename( - columns={ - "plain_int64": "avg_plain_int64", - "plain_float64": "sum_plain_float64", - "dup_ints": "nunique_dup_ints", - "float64_positive": "mean_float64_positive", - "int64_with_zeros": "neg_mean_int64_with_zeros", - } - ) - ) - lhs = result[expected.columns] - rhs = expected - tm.assert_frame_equal(lhs, rhs) - - -def test_aggregation_without_group_by(t, df): - expr = t.aggregate( - avg_plain_int64=t.plain_int64.mean(), - sum_plain_float64=t.plain_float64.sum(), - ) - result = expr.compile()[["avg_plain_int64", "sum_plain_float64"]] - new_names = { - "plain_float64": "sum_plain_float64", - "plain_int64": "avg_plain_int64", - } - pandas_df = df.compute().reset_index(drop=True) - expected = ( - pd.Series( - [ - pandas_df["plain_int64"].mean(), - pandas_df["plain_float64"].sum(), - ], - index=["plain_int64", "plain_float64"], - ) - .to_frame() - .T.rename(columns=new_names) - ) - lhs = result[expected.columns].compute().reset_index(drop=True) - tm.assert_frame_equal(lhs, expected) - - -def test_group_by_with_having(t, df): - expr = ( - t.group_by(t.dup_strings) - .having(t.plain_float64.sum() == 5) - .aggregate(avg_a=t.plain_int64.mean(), sum_c=t.plain_float64.sum()) - ) - result = expr.compile() - - expected = ( - df.groupby("dup_strings") - .agg({"plain_int64": "mean", "plain_float64": "sum"}) - .reset_index() - .rename(columns={"plain_int64": "avg_a", "plain_float64": "sum_c"}) - ) - expected = expected.loc[expected.sum_c == 5, ["avg_a", "sum_c"]] - - tm.assert_frame_equal( - result[expected.columns].compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -def test_group_by_rename_key(t, df): 
- expr = t.group_by(t.dup_strings.name("foo")).aggregate( - dup_string_count=t.dup_strings.count() - ) - - assert "foo" in expr.schema() - result = expr.compile() - assert "foo" in result.columns - - expected = ( - df.groupby("dup_strings") - .dup_strings.count() - .rename("dup_string_count") - .reset_index() - .rename(columns={"dup_strings": "foo"}) - ) - tm.assert_frame_equal( - result.compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -@pytest.mark.parametrize("reduction", ["mean", "sum", "count", "std", "var"]) -@pytest.mark.parametrize( - "where", - [ - lambda t: (t.plain_strings == "a") | (t.plain_strings == "c"), - lambda t: (t.dup_strings == "d") - & ((t.plain_int64 == 1) | (t.plain_int64 == 3)), - lambda t: None, - ], -) -def test_reduction(t, pandas_df, reduction, where): - func = getattr(t.plain_int64, reduction) - mask = where(t) - expr = func(where=mask) - result = expr.execute() - - df_mask = where(pandas_df) - expected_func = getattr( - pandas_df.loc[df_mask if df_mask is not None else slice(None), "plain_int64"], - reduction, - ) - expected = expected_func() - assert result == expected - - -@pytest.mark.parametrize( - "where", - [ - lambda t: (t.plain_strings == "a") | (t.plain_strings == "c"), - lambda t: None, - ], -) -def test_grouped_reduction(t, df, where): - ibis_where = where(t) - expr = t.group_by(t.dup_strings).aggregate( - nunique_dup_ints=t.dup_ints.nunique(), - sum_plain_int64=t.plain_int64.sum(where=ibis_where), - mean_plain_int64=t.plain_int64.mean(where=ibis_where), - count_plain_int64=t.plain_int64.count(where=ibis_where), - std_plain_int64=t.plain_int64.std(where=ibis_where), - var_plain_int64=t.plain_int64.var(where=ibis_where), - nunique_plain_int64=t.plain_int64.nunique(where=ibis_where), - ) - result = expr.compile() - - df_mask = where(df.compute()) - mask = slice(None) if df_mask is None else df_mask - - expected = ( - df.compute() - .groupby("dup_strings") - .agg( - { - "dup_ints": "nunique", - "plain_int64": [ - lambda x, mask=mask: x[mask].sum(), - lambda x, mask=mask: x[mask].mean(), - lambda x, mask=mask: x[mask].count(), - lambda x, mask=mask: x[mask].std(), - lambda x, mask=mask: x[mask].var(), - lambda x, mask=mask: x[mask].nunique(), - ], - } - ) - .reset_index() - ) - result = result.compute() - - assert len(result.columns) == len(expected.columns) - - expected.columns = [ - "dup_strings", - "nunique_dup_ints", - "sum_plain_int64", - "mean_plain_int64", - "count_plain_int64", - "std_plain_int64", - "var_plain_int64", - "nunique_plain_int64", - ] - # guarantee ordering - result = result[expected.columns] - # dask and pandas differ slightly in how they treat groups with no entry - # we're not testing that so fillna here. 
- result = result.fillna(0.0) - expected = expected.fillna(0.0) - - # match the dtypes - if df_mask is None: - expected["mean_plain_int64"] = expected.mean_plain_int64.astype("float64") - else: - expected["sum_plain_int64"] = expected.sum_plain_int64.astype("int64") - expected["count_plain_int64"] = expected.count_plain_int64.astype("int64") - expected["nunique_plain_int64"] = expected.nunique_plain_int64.astype("int64") - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "reduction", - [ - lambda x: x.any(), - lambda x: x.all(), - lambda x: ~(x.any()), - lambda x: ~(x.all()), - ], -) -def test_boolean_aggregation(t, pandas_df, reduction): - expr = reduction(t.plain_int64 == 1) - result = expr.execute() - expected = reduction(pandas_df.plain_int64 == 1) - assert result == expected - - -@pytest.mark.parametrize("column", ["float64_with_zeros", "int64_with_zeros"]) -def test_nullif_zero(t, pandas_df, column): - expr = t[column].nullif(0) - result = expr.execute() - expected = pandas_df[column].replace(0, np.nan) - tm.assert_series_equal(result, expected, check_index=False, check_names=False) - - -@pytest.mark.parametrize( - ("left", "right", "expected", "compare"), - [ - param( - lambda t: ibis.literal(1), - lambda t: ibis.literal(1), - lambda df: np.nan, - np.testing.assert_array_equal, # treats NaNs as equal - id="literal_literal_equal", - ), - param( - lambda t: ibis.literal(1), - lambda t: ibis.literal(2), - lambda df: 1, - np.testing.assert_equal, - id="literal_literal_not_equal", - ), - param( - lambda t: t.dup_strings, - lambda t: ibis.literal("a"), - lambda df: df.dup_strings.where(df.dup_strings != "a"), - tm.assert_series_equal, - id="series_literal", - ), - param( - lambda t: t.dup_strings, - lambda t: t.dup_strings, - lambda df: df.dup_strings.where(df.dup_strings != df.dup_strings), - tm.assert_series_equal, - id="series_series", - ), - param( - lambda t: ibis.literal("a"), - lambda t: t.dup_strings, - lambda _: pd.Series(["a", np.nan, "a"], name="dup_strings"), - tm.assert_series_equal, - id="literal_series", - ), - ], -) -def test_nullif(t, con, pandas_df, left, right, expected, compare): - expr = left(t).nullif(right(t)) - result = con.execute(expr.name("dup_strings")) - compare(result, expected(pandas_df)) - - -def test_nullif_inf(con): - df = pd.DataFrame({"a": [np.inf, 3.14, -np.inf, 42.0]}) - t = ibis.memtable(df) - expr = t.a.nullif(np.inf).nullif(-np.inf) - result = con.execute(expr) - expected = pd.Series([np.nan, 3.14, np.nan, 42.0], name="a") - tm.assert_series_equal(result, expected, check_names=False) - - -def test_group_concat(t, df): - expr = ( - t[t.dup_ints == 1] - .group_by(t.dup_strings) - .aggregate(foo=t.dup_ints.group_concat(",")) - ) - result = expr.compile() - expected = ( - df[df.dup_ints == 1] - .groupby("dup_strings") - .apply(lambda df: ",".join(df.dup_ints.astype(str))) - .reset_index() - .rename(columns={0: "foo"}) - ) - - left = ( - result[expected.columns] - .compute() - .sort_values("dup_strings") - .reset_index(drop=True) - ) - right = expected.compute().sort_values("dup_strings").reset_index(drop=True) - tm.assert_frame_equal(left, right) - - -@pytest.mark.parametrize("offset", [0, 2]) -def test_frame_limit(t, df, offset): - n = 5 - df_expr = t.limit(n, offset=offset) - result = df_expr.compile() - expected = df.loc[offset : offset + n].reset_index(drop=True) - tm.assert_frame_equal( - result[expected.columns].compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - 
-@pytest.mark.xfail(raises=AttributeError, reason="TableColumn does not implement limit") -@pytest.mark.parametrize("offset", [0, 2]) -def test_series_limit(t, df, offset): - n = 5 - s_expr = t.plain_int64.limit(n, offset=offset) - result = s_expr.compile() - tm.assert_series_equal( - result, df.plain_int64.iloc[offset : offset + n], check_index=False - ) - - -@pytest.mark.xfail( - condition=vparse(dask.__version__) < vparse("2024.2.0"), - reason="not implemented until 2024.2.0", -) -def test_complex_order_by(t, df): - expr = t.order_by([ibis.desc(t.plain_int64 * t.plain_float64), t.plain_float64]) - result = expr.compile() - expected = ( - df.assign(foo=df.plain_int64 * df.plain_float64) - .sort_values(["foo", "plain_float64"], ascending=[False, True]) - .drop(["foo"], axis=1) - .reset_index(drop=True) - ) - - tm.assert_frame_equal( - result[expected.columns].compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -def test_count_distinct(t, pandas_df): - expr = t.dup_strings.nunique() - result = expr.execute() - expected = pandas_df.dup_strings.nunique() - assert result == expected - - -def test_value_counts(t, df): - expr = t.dup_strings.value_counts() - result = expr.compile() - expected = ( - df.compute() - .dup_strings.value_counts() - .rename("dup_strings") - .reset_index(name="dup_strings_count") - .rename(columns={"index": "dup_strings"}) - .sort_values(["dup_strings"]) - .reset_index(drop=True) - ) - tm.assert_frame_equal( - result[expected.columns].compute().reset_index(drop=True), expected - ) - - -def test_table_count(t, df): - expr = t.count() - result = expr.execute() - expected = len(df) - assert result == expected - - -def test_weighted_average(t, df): - expr = t.group_by(t.dup_strings).aggregate( - avg=(t.plain_float64 * t.plain_int64).sum() / t.plain_int64.sum() - ) - result = expr.compile() - expected = ( - df.groupby("dup_strings") - .apply( - lambda df: (df.plain_int64 * df.plain_float64).sum() / df.plain_int64.sum() - ) - .reset_index() - .rename(columns={0: "avg"}) - ) - tm.assert_frame_equal( - result[expected.columns].compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -def test_group_by_multiple_keys(t, df): - expr = t.group_by([t.dup_strings, t.dup_ints]).aggregate( - avg_plain_float64=t.plain_float64.mean() - ) - result = expr.compile() - expected = ( - df.groupby(["dup_strings", "dup_ints"]) - .agg({"plain_float64": "mean"}) - .reset_index() - .rename(columns={"plain_float64": "avg_plain_float64"}) - ) - tm.assert_frame_equal( - result[expected.columns] - .compute() - .sort_values(["dup_strings", "dup_ints"]) - .reset_index(drop=True), - expected.compute() - .sort_values(["dup_strings", "dup_ints"]) - .reset_index(drop=True), - ) - - -def test_mutate_after_group_by(t, df): - gb = t.group_by(t.dup_strings).aggregate(avg_plain_float64=t.plain_float64.mean()) - expr = gb.mutate(x=gb.avg_plain_float64) - result = expr.compile() - expected = ( - df.groupby("dup_strings") - .agg({"plain_float64": "mean"}) - .reset_index() - .rename(columns={"plain_float64": "avg_plain_float64"}) - ) - expected = expected.assign(x=expected.avg_plain_float64) - tm.assert_frame_equal( - result[expected.columns] - .compute() - .sort_values("dup_strings") - .reset_index(drop=True), - expected.compute().sort_values("dup_strings").reset_index(drop=True), - ) - - -def test_group_by_with_unnamed_arithmetic(t, df): - expr = t.group_by(t.dup_strings).aggregate( - naive_variance=((t.plain_float64**2).sum() - 
t.plain_float64.mean() ** 2) - / t.plain_float64.count() - ) - result = expr.compile() - expected = ( - df.compute() - .groupby("dup_strings") - .agg({"plain_float64": lambda x: ((x**2).sum() - x.mean() ** 2) / x.count()}) - .reset_index() - .rename(columns={"plain_float64": "naive_variance"}) - ) - tm.assert_frame_equal( - result[expected.columns].compute().reset_index(drop=True), expected - ) - - -def test_isnull(t, pandas_df): - expr = t.strings_with_nulls.isnull() - result = expr.execute() - expected = pandas_df.strings_with_nulls.isnull() - tm.assert_series_equal(result, expected, check_index=False, check_names=False) - - -def test_notnull(t, pandas_df): - expr = t.strings_with_nulls.notnull() - result = expr.execute() - expected = pandas_df.strings_with_nulls.notnull() - tm.assert_series_equal(result, expected, check_names=False) - - -@pytest.mark.parametrize("raw_value", [0.0, 1.0]) -def test_scalar_parameter(t, pandas_df, raw_value): - value = ibis.param(dt.double) - expr = t.float64_with_zeros == value - result = expr.execute(params={value: raw_value}) - expected = pandas_df.float64_with_zeros == raw_value - tm.assert_series_equal(result, expected, check_names=False) - - -@pytest.mark.parametrize("elements", [[1], (1,), {1}, frozenset({1})]) -def test_isin(t, pandas_df, elements): - expr = t.plain_float64.isin(elements) - expected = pandas_df.plain_float64.isin(elements) - result = expr.execute() - tm.assert_series_equal(result, expected, check_names=False) - - -@pytest.mark.parametrize("elements", [[1], (1,), {1}, frozenset({1})]) -def test_notin(t, pandas_df, elements): - expr = t.plain_float64.notin(elements) - expected = ~pandas_df.plain_float64.isin(elements) - result = expr.execute() - tm.assert_series_equal(result, expected, check_index=False, check_names=False) - - -def test_cast_on_group_by(t, df): - expr = t.group_by(t.dup_strings).aggregate( - casted=(t.float64_with_zeros == 0).cast("int64").sum() - ) - - result = expr.compile() - expected = ( - df.groupby("dup_strings") - .float64_with_zeros.apply(lambda s: (s == 0).astype("int64").sum()) - .reset_index() - .rename(columns={"float64_with_zeros": "casted"}) - ) - tm.assert_frame_equal( - result.compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -@pytest.mark.parametrize( - "op", - [ - operator.add, - operator.mul, - operator.sub, - operator.truediv, - operator.floordiv, - operator.mod, - operator.pow, - ], - ids=operator.attrgetter("__name__"), -) -@pytest.mark.parametrize("args", [lambda c: (1.0, c), lambda c: (c, 1.0)]) -def test_left_binary_op(t, pandas_df, op, args): - expr = op(*args(t.float64_with_zeros)) - result = expr.execute() - expected = op(*args(pandas_df.float64_with_zeros)).astype(result.dtype) - tm.assert_series_equal(result, expected, check_index=False, check_names=False) - - -@pytest.mark.parametrize( - "op", - [ - operator.add, - operator.mul, - operator.sub, - operator.truediv, - operator.floordiv, - operator.mod, - operator.pow, - ], - ids=operator.attrgetter("__name__"), -) -@pytest.mark.parametrize("argfunc", [lambda c: (1.0, c), lambda c: (c, 1.0)]) -def test_left_binary_op_gb(t, pandas_df, op, argfunc): - expr = t.group_by("dup_strings").aggregate( - foo=op(*argfunc(t.float64_with_zeros)).sum() - ) - result = expr.execute() - expected = ( - pandas_df.groupby("dup_strings") - .float64_with_zeros.apply(lambda s: op(*argfunc(s)).sum()) - .reset_index() - .rename(columns={"float64_with_zeros": "foo"}) - ) - expected["foo"] = 
expected["foo"].astype(result["foo"].dtype) - tm.assert_frame_equal(result, expected, check_names=False) - - -@pytest.mark.parametrize( - "left_f", - [ - param(lambda e: e - 1, id="sub"), - param(lambda _: 0.0, id="zero"), - param(lambda _: None, id="none"), - ], -) -@pytest.mark.parametrize( - "right_f", - [ - param(lambda e: e + 1, id="add"), - param(lambda _: 1.0, id="one"), - param(lambda _: None, id="none"), - ], -) -def test_ifelse_series(t, pandas_df, left_f, right_f): - col_expr = t["plain_int64"] - result = ibis.ifelse( - col_expr > col_expr.mean(), left_f(col_expr), right_f(col_expr) - ).execute() - - series = pandas_df["plain_int64"] - cond = series > series.mean() - left = left_f(series) - if not isinstance(left, pd.Series): - left = pd.Series(np.repeat(left, len(cond)), name=cond.name) - expected = left.where(cond, right_f(series)) - - tm.assert_series_equal( - result.astype(object).fillna(pd.NA), - expected.astype(object).fillna(pd.NA), - check_dtype=False, - check_names=False, - ) - - -@pytest.mark.parametrize( - ("cond", "expected_func"), - [ - param(True, lambda df: df["plain_int64"].astype("float64"), id="true"), - param(False, lambda df: pd.Series(np.repeat(3.0, len(df))), id="false"), - ], -) -def test_ifelse_scalar(t, pandas_df, cond, expected_func): - expr = ibis.ifelse(cond, t["plain_int64"], 3.0) - result = expr.execute() - expected = expected_func(pandas_df) - tm.assert_series_equal(result, expected, check_names=False) - - -def test_ifelse_long(batting, batting_pandas_df): - col_expr = batting["AB"] - result = ibis.ifelse(col_expr > col_expr.mean(), col_expr, 0.0).execute() - - series = batting_pandas_df["AB"] - expected = series.where(series > series.mean(), other=0.0).astype("float64") - - tm.assert_series_equal(result, expected, check_names=False) - - -def test_round(t, pandas_df): - precision = 2 - mult = 3.33333 - result = (t.count() * mult).round(precision).execute() - expected = np.around(len(pandas_df) * mult, precision) - npt.assert_almost_equal(result, expected, decimal=precision) - - -def test_quantile_group_by(batting, batting_pandas_df): - def q_fun(x, quantile): - res = x.quantile(quantile).tolist() - return [res for _ in range(len(x))] - - frac = 0.2 - result = ( - batting.group_by("teamID") - .mutate(res=lambda x: x.RBI.quantile([frac, 1 - frac])) - .res.execute() - ) - expected = ( - batting_pandas_df.groupby("teamID") - .RBI.transform(q_fun, quantile=[frac, 1 - frac]) - .rename("res") - ) - tm.assert_series_equal(result, expected, check_index=False) - - -def test_table_distinct(t, df): - expr = t[["dup_strings"]].distinct() - result = expr.compile() - expected = df[["dup_strings"]].drop_duplicates() - tm.assert_frame_equal( - result.compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -@pytest.mark.parametrize("distinct", [True, False]) -def test_union(client, df1, distinct): - t = client.table("df1") - expr = t.union(t, distinct=distinct) - result = expr.compile() - expected = df1 if distinct else dd.concat([df1, df1], axis=0, ignore_index=True) - - # match indices because of dask reset_index behavior - result = result.compute().reset_index(drop=True) - expected = expected.compute().reset_index(drop=True) - - tm.assert_frame_equal(result, expected) - - -def test_intersect(client, df1, intersect_df2): - t1 = client.table("df1") - t2 = client.table("intersect_df2") - expr = t1.intersect(t2) - result = expr.compile() - expected = df1.merge(intersect_df2, on=list(df1.columns)) - tm.assert_frame_equal( - 
result.compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) - - -def test_difference(client, df1, intersect_df2): - t1 = client.table("df1") - t2 = client.table("intersect_df2") - expr = t1.difference(t2) - result = expr.compile() - merged = df1.merge(intersect_df2, on=list(df1.columns), how="outer", indicator=True) - expected = merged[merged["_merge"] != "both"].drop("_merge", axis=1) - - # force same index - result = result.compute().reset_index(drop=True) - expected = expected.compute().reset_index(drop=True) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "distinct", - [ - param( - True, - marks=pytest.mark.xfail( - raises=TypeError, - reason="dask cannot compute the distinct element of an array column", - ), - ), - False, - ], -) -def test_union_with_list_types(t, df, distinct): - expr = t.union(t, distinct=distinct) - result = expr.compile() - expected = df if distinct else dd.concat([df, df], axis=0, ignore_index=True) - tm.assert_frame_equal( - result.compute().reset_index(drop=True), - expected.compute().reset_index(drop=True), - ) diff --git a/ibis/backends/dask/tests/test_strings.py b/ibis/backends/dask/tests/test_strings.py deleted file mode 100644 index dfd70e291d3e2..0000000000000 --- a/ibis/backends/dask/tests/test_strings.py +++ /dev/null @@ -1,116 +0,0 @@ -from __future__ import annotations - -from warnings import catch_warnings - -import pytest -from pytest import param - -dd = pytest.importorskip("dask.dataframe") -from dask.dataframe.utils import tm # noqa: E402 - - -@pytest.mark.parametrize( - ("case_func", "expected_func"), - [ - param( - lambda s: s.length(), - lambda s: s.str.len().astype("int32"), - id="length", - ), - param(lambda s: s.substr(1, 2), lambda s: s.str[1:3], id="substr"), - param(lambda s: s[1:3], lambda s: s.str[1:3], id="slice"), - # TODO - execute_substring_series_series is broken - param( - lambda s: s[s.length() - 1 :], - lambda s: s.str[-1:], - id="expr_slice_begin", - ), - param( - lambda s: s[: s.length()], - lambda s: s, - id="expr_slice_end", - ), - param( - lambda s: s[s.length() - 2 : s.length() - 1], - lambda s: s.str[-2:-1], - id="expr_slice_begin_end", - ), - param(lambda s: s.strip(), lambda s: s.str.strip(), id="strip"), - param(lambda s: s.lstrip(), lambda s: s.str.lstrip(), id="lstrip"), - param(lambda s: s.rstrip(), lambda s: s.str.rstrip(), id="rstrip"), - param( - lambda s: s.lpad(3, "a"), - lambda s: s.str.pad(3, side="left", fillchar="a"), - id="lpad", - ), - param( - lambda s: s.rpad(3, "b"), - lambda s: s.str.pad(3, side="right", fillchar="b"), - id="rpad", - ), - param(lambda s: s.reverse(), lambda s: s.str[::-1], id="reverse"), - param(lambda s: s.lower(), lambda s: s.str.lower(), id="lower"), - param(lambda s: s.upper(), lambda s: s.str.upper(), id="upper"), - param(lambda s: s.repeat(2), lambda s: s * 2, id="repeat"), - param( - lambda s: s.contains("a"), - lambda s: s.str.contains("a", regex=False), - id="contains", - ), - param( - lambda s: ~(s.contains("a")), - lambda s: ~s.str.contains("a", regex=False), - id="not_contains", - ), - param( - lambda s: s.like("a"), - lambda s: s.str.contains("^a$", regex=True), - id="like", - ), - param( - lambda s: s.re_search("(ab)+"), - lambda s: s.str.contains("(?:ab)+", regex=True), - id="re_search", - ), - param( - lambda s: s.re_search("(ab)+") | s.re_search("d{1,2}ee"), - lambda s: ( - s.str.contains("(?:ab)+", regex=True) | s.str.contains("d{1,2}ee") - ), - id="re_search_or", - ), - param( - lambda s: s + s.rpad(3, 
"a"), - lambda s: s + s.str.pad(3, side="right", fillchar="a"), - id="rpad2", - ), - param( - lambda s: s.split(" "), - lambda s: s.str.split(" "), - id="split_spaces", - ), - ], -) -def test_string_ops(t, df, case_func, expected_func): - # ignore matching UserWarnings - with catch_warnings(record=True): - expr = case_func(t.strings_with_space) - result = expr.name("result").execute() - series = expected_func(df.strings_with_space).rename("result").compute() - tm.assert_series_equal(result, series, check_index=False) - - -def test_grouped_string_re_search(t, df): - expr = t.group_by(t.dup_strings).aggregate( - sum=t.strings_with_space.re_search("(ab)+").cast("int64").sum() - ) - - result = expr.compile() - expected = ( - df.groupby("dup_strings") - .strings_with_space.apply(lambda s: s.str.contains("(?:ab)+", regex=True).sum()) - .reset_index() - .rename(columns={"strings_with_space": "sum"}) - ) - - tm.assert_frame_equal(result.compute(), expected.compute()) diff --git a/ibis/backends/dask/tests/test_structs.py b/ibis/backends/dask/tests/test_structs.py deleted file mode 100644 index c92f2b72c49d8..0000000000000 --- a/ibis/backends/dask/tests/test_structs.py +++ /dev/null @@ -1,96 +0,0 @@ -from __future__ import annotations - -from collections import OrderedDict - -import pandas as pd -import pytest - -import ibis -import ibis.expr.datatypes as dt - -dd = pytest.importorskip("dask.dataframe") - -from dask.dataframe.utils import tm # noqa: E402 - - -@pytest.fixture(scope="module") -def value(): - return OrderedDict([("fruit", "pear"), ("weight", 0)]) - - -@pytest.fixture(scope="module") -def struct_client(value, npartitions): - df = dd.from_pandas( - pd.DataFrame( - { - "s": [ - OrderedDict([("fruit", "apple"), ("weight", None)]), - value, - OrderedDict([("fruit", "pear"), ("weight", 1)]), - ], - "key": list("aab"), - "value": [1, 2, 3], - } - ), - npartitions=npartitions, - ) - return ibis.dask.connect({"t": df}) - - -@pytest.fixture -def struct_table(struct_client): - return struct_client.table( - "t", - schema={ - "s": dt.Struct.from_tuples([("fruit", dt.string), ("weight", dt.int8)]) - }, - ) - - -def test_struct_field_literal(value, con): - struct = ibis.literal(value) - assert struct.type() == dt.Struct.from_tuples( - [("fruit", dt.string), ("weight", dt.int8)] - ) - - expr = struct["fruit"] - result = con.execute(expr) - assert result == "pear" - - expr = struct["weight"] - result = con.execute(expr) - assert result == 0 - - -def test_struct_field_series(struct_table): - t = struct_table - expr = t.s["fruit"] - result = expr.execute() - expected = pd.Series(["apple", "pear", "pear"], name="fruit") - - tm.assert_series_equal(result, expected, check_index=False) - - -def test_struct_field_series_group_by_key(struct_table): - t = struct_table - expr = t.group_by(t.s["fruit"]).aggregate(total=t.value.sum()) - result = expr.execute() - expected = pd.DataFrame([("apple", 1), ("pear", 5)], columns=["fruit", "total"]) - - tm.assert_frame_equal( - result.reset_index(drop=True), expected.reset_index(drop=True) - ) - - -def test_struct_field_series_group_by_value(struct_table): - t = struct_table - expr = t.group_by(t.key).aggregate(total=t.s["weight"].sum()) - result = expr.execute() - # these are floats because we have a NULL value in the input data - expected = pd.DataFrame([("a", 0.0), ("b", 1.0)], columns=["key", "total"]) - tm.assert_frame_equal( - result, - expected.assign( - total=lambda df: df.total.astype(expr.total.type().to_pandas()) - ), - ) diff --git 
a/ibis/backends/dask/tests/test_temporal.py b/ibis/backends/dask/tests/test_temporal.py deleted file mode 100644 index a70bd37005f0b..0000000000000 --- a/ibis/backends/dask/tests/test_temporal.py +++ /dev/null @@ -1,213 +0,0 @@ -from __future__ import annotations - -import datetime -from operator import methodcaller - -import numpy as np -import pandas as pd -import pytest -from packaging.version import parse as parse_version -from pytest import param - -import ibis -from ibis import literal as L -from ibis.expr import datatypes as dt - -dd = pytest.importorskip("dask.dataframe") -from dask.dataframe.utils import tm # noqa: E402 - - -@pytest.mark.parametrize( - ("case_func", "expected_func"), - [ - (lambda v: v.strftime("%Y%m%d"), lambda vt: vt.strftime("%Y%m%d")), - (lambda v: v.year(), lambda vt: vt.year), - (lambda v: v.month(), lambda vt: vt.month), - (lambda v: v.day(), lambda vt: vt.day), - (lambda v: v.hour(), lambda vt: vt.hour), - (lambda v: v.minute(), lambda vt: vt.minute), - (lambda v: v.second(), lambda vt: vt.second), - (lambda v: v.millisecond(), lambda vt: int(vt.microsecond / 1e3)), - ] - + [ - (methodcaller("strftime", pattern), methodcaller("strftime", pattern)) - for pattern in [ - "%Y%m%d %H", - 'DD BAR %w FOO "DD"', - 'DD BAR %w FOO "D', - 'DD BAR "%w" FOO "D', - 'DD BAR "%d" FOO "D', - 'DD BAR "%c" FOO "D', - 'DD BAR "%x" FOO "D', - 'DD BAR "%X" FOO "D', - ] - ], -) -def test_timestamp_functions(con, case_func, expected_func): - v = L("2015-09-01 14:48:05.359").cast("timestamp") - vt = datetime.datetime( - year=2015, - month=9, - day=1, - hour=14, - minute=48, - second=5, - microsecond=359000, - ) - result = case_func(v) - expected = expected_func(vt) - assert con.execute(result) == expected - - -@pytest.mark.parametrize( - "column", - ["datetime_strings_naive", "datetime_strings_ny", "datetime_strings_utc"], -) -def test_cast_datetime_strings_to_date(t, df, column): - expr = t[column].cast("date") - result = expr.execute() - df_computed = df.compute() - expected = pd.to_datetime(df_computed[column]).map(lambda x: x.date()) - - tm.assert_series_equal( - result.reset_index(drop=True).rename("tmp"), - expected.reset_index(drop=True).rename("tmp"), - ) - - -@pytest.mark.parametrize( - "column", - ["datetime_strings_naive", "datetime_strings_ny", "datetime_strings_utc"], -) -def test_cast_datetime_strings_to_timestamp(t, pandas_df, column): - expr = t[column].cast(dt.Timestamp(scale=9)) - result = expr.execute() - expected = pd.to_datetime(pandas_df[column]) - if getattr(expected.dtype, "tz", None) is not None: - expected = expected.dt.tz_convert(None) - tm.assert_series_equal(result, expected, check_names=False) - - -@pytest.mark.parametrize( - "column", - ["plain_datetimes_naive", "plain_datetimes_ny", "plain_datetimes_utc"], -) -def test_cast_integer_to_temporal_type(t, df, pandas_df, column): - column_type = t[column].type() - expr = t.plain_int64.cast(column_type) - result = expr.execute() - - expected = pd.Series( - pd.to_datetime(pandas_df.plain_int64.values, unit="s").values, - index=pandas_df.index, - name="plain_int64", - ).dt.tz_localize(column_type.timezone) - - tm.assert_series_equal( - result.reset_index(drop=True), - expected.reset_index(drop=True), - check_names=False, - ) - - -def test_cast_integer_to_date(t, pandas_df): - expr = t.plain_int64.cast("date") - result = expr.execute() - expected = pd.Series( - pd.to_datetime(pandas_df.plain_int64.values, unit="D").date, - index=pandas_df.index, - name="plain_int64", - ) - tm.assert_series_equal(result, 
expected, check_names=False) - - -def test_times_ops(t, df): - result = t.plain_datetimes_naive.time().between("10:00", "10:00").execute() - expected = pd.Series(np.zeros(len(df), dtype=bool)) - tm.assert_series_equal( - result.reset_index(drop=True), - expected.reset_index(drop=True), - check_names=False, - ) - - result = t.plain_datetimes_naive.time().between("01:00", "02:00").execute() - expected = pd.Series(np.ones(len(df), dtype=bool)) - tm.assert_series_equal( - result.reset_index(drop=True), - expected.reset_index(drop=True), - check_names=False, - ) - - -@pytest.mark.parametrize( - ("tz", "rconstruct", "column"), - [ - ("US/Eastern", np.ones, "plain_datetimes_utc"), - ("US/Eastern", np.zeros, "plain_datetimes_naive"), - ("UTC", np.ones, "plain_datetimes_utc"), - ("UTC", np.ones, "plain_datetimes_naive"), - (None, np.ones, "plain_datetimes_utc"), - (None, np.ones, "plain_datetimes_naive"), - ], - ids=lambda x: str(getattr(x, "__name__", x)).lower().replace("/", "_"), -) -def test_times_ops_with_tz(t, df, tz, rconstruct, column): - expected = dd.from_array(rconstruct(len(df), dtype=bool)) - time = t[column].time() - expr = time.between("01:00", "02:00", timezone=tz) - result = expr.execute() - tm.assert_series_equal( - result.reset_index(drop=True), - expected.compute().reset_index(drop=True), - check_names=False, - ) - - # Test that casting behavior is the same as using the timezone kwarg - ts = t[column].cast(dt.Timestamp(timezone=tz)) - expr = ts.time().between("01:00", "02:00") - result = expr.execute() - tm.assert_series_equal( - result.reset_index(drop=True), - expected.compute().reset_index(drop=True), - check_names=False, - ) - - -@pytest.mark.parametrize( - ("op", "expected"), - [ - param(lambda x, y: x + y, lambda x, y: x.values * 2, id="add"), - param(lambda x, y: x - y, lambda x, y: x.values - y.values, id="sub"), - param(lambda x, y: x * 2, lambda x, y: x.values * 2, id="mul"), - param( - lambda x, y: x // 2, - lambda x, y: x.values // 2, - id="floordiv", - marks=pytest.mark.xfail( - parse_version(pd.__version__) < parse_version("0.23.0"), - raises=TypeError, - reason=( - "pandas versions less than 0.23.0 do not support floor " - "division involving timedelta columns" - ), - ), - ), - ], -) -def test_interval_arithmetic(op, expected): - data = pd.timedelta_range("0 days", "10 days", freq="D") - pandas_df = pd.DataFrame({"td": data}) - con = ibis.dask.connect( - { - "df1": dd.from_pandas(pandas_df, npartitions=1), - "df2": dd.from_pandas(pandas_df, npartitions=1), - } - ) - t1 = con.table("df1") - expr = op(t1.td, t1.td) - result = expr.execute() - expected = pd.Series(expected(data, data), name=expr.get_name()) - - tm.assert_series_equal( - result.reset_index(drop=True), expected.reset_index(drop=True) - ) diff --git a/ibis/backends/dask/tests/test_udf.py b/ibis/backends/dask/tests/test_udf.py deleted file mode 100644 index 97974c155718e..0000000000000 --- a/ibis/backends/dask/tests/test_udf.py +++ /dev/null @@ -1,436 +0,0 @@ -from __future__ import annotations - -import numpy as np -import pandas as pd -import pandas.testing as tm -import pytest - -import ibis -import ibis.expr.datatypes as dt -import ibis.expr.types as ir -from ibis.legacy.udf.vectorized import analytic, elementwise, reduction - -dd = pytest.importorskip("dask.dataframe") - - -@pytest.fixture -def df(npartitions): - return dd.from_pandas( - pd.DataFrame( - { - "a": list("abc"), - "b": [1, 2, 3], - "c": [4.0, 5.0, 6.0], - "key": list("aab"), - } - ), - npartitions=npartitions, - ) - - -@pytest.fixture 
-def df2(npartitions): - # df with some randomness - return dd.from_pandas( - pd.DataFrame( - { - "a": np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), - "b": np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), - "c": np.arange(7, dtype=int).tolist(), - "d": list("aaaaddd"), - "key": list("ddeefff"), - } - ), - npartitions=npartitions, - ) - - -@pytest.fixture -def df_timestamp(npartitions): - df = pd.DataFrame( - { - "a": list(range(10)), - "b": list("wwwwwxxxxx"), - "c": list("yyyzzzyyzz"), - } - ) - df["a"] = df.a.astype(pd.DatetimeTZDtype(tz="UTC")) - return dd.from_pandas( - df, - npartitions=npartitions, - ) - - -@pytest.fixture -def con(df, df2, df_timestamp): - return ibis.dask.connect({"df": df, "df2": df2, "df_timestamp": df_timestamp}) - - -@pytest.fixture -def t(con): - return con.table("df") - - -@pytest.fixture -def t2(con): - return con.table("df2") - - -@pytest.fixture -def t_timestamp(con): - return con.table("df_timestamp") - - -# ------------- -# UDF Functions -# ------------- - -with pytest.warns(FutureWarning, match="v9.0"): - - @elementwise(input_type=["string"], output_type="int64") - def my_string_length(series, **kwargs): - return series.str.len() * 2 - - @elementwise(input_type=[dt.double, dt.double], output_type=dt.double) - def my_add(series1, series2, **kwargs): - return series1 + series2 - - @reduction(["double"], "double") - def my_mean(series): - return series.mean() - - @reduction( - input_type=[dt.Timestamp(timezone="UTC")], - output_type=dt.Timestamp(timezone="UTC"), - ) - def my_tz_min(series): - return series.min() - - @elementwise( - input_type=[dt.Timestamp(timezone="UTC")], - output_type=dt.Timestamp(timezone="UTC"), - ) - def my_tz_add_one(series): - return series + pd.Timedelta(1, unit="D") - - @reduction(input_type=[dt.string], output_type=dt.int64) - def my_string_length_sum(series, **kwargs): - return (series.str.len() * 2).sum() - - @reduction(input_type=[dt.double, dt.double], output_type=dt.double) - def my_corr(lhs, rhs, **kwargs): - return lhs.corr(rhs) - - @elementwise([dt.double], dt.double) - def add_one(x): - return x + 1.0 - - @elementwise([dt.double], dt.double) - def times_two(x): - return x * 2.0 - - @analytic(input_type=["double"], output_type="double") - def zscore(series): - return (series - series.mean()) / series.std() - - @reduction( - input_type=[dt.double], - output_type=dt.Array(dt.double), - ) - def collect(series): - return list(series) - - -# ----- -# Tests -# ----- - - -def test_udf(t, df): - expr = my_string_length(t.a) - - assert isinstance(expr, ir.Column) - - result = expr.execute() - expected = df.a.str.len().mul(2).compute() - - tm.assert_series_equal(result, expected, check_names=False, check_index=False) - - -def test_multiple_argument_udf(t, df): - expr = my_add(t.b, t.c) - - assert isinstance(expr, ir.Column) - assert isinstance(expr, ir.NumericColumn) - assert isinstance(expr, ir.FloatingColumn) - - result = expr.execute() - expected = (df.b + df.c).compute() - tm.assert_series_equal(result, expected, check_index=False, check_names=False) - - -def test_multiple_argument_udf_group_by(t): - expr = t.group_by(t.key).aggregate(my_add=my_add(t.b, t.c).sum()) - - assert isinstance(expr, ir.Table) - assert isinstance(expr.my_add, ir.Column) - assert isinstance(expr.my_add, ir.NumericColumn) - assert isinstance(expr.my_add, ir.FloatingColumn) - - result = expr.execute() - expected = pd.DataFrame( - {"key": list("ab"), "my_add": [sum([1.0 + 4.0, 2.0 + 5.0]), 3.0 + 6.0]} - ) - 
tm.assert_frame_equal( - result.reset_index(drop=True), expected.reset_index(drop=True) - ) - - -def test_udaf(t): - expr = my_string_length_sum(t.a) - - assert isinstance(expr, ir.Scalar) - - result = expr.execute() - expected = t.a.execute().str.len().mul(2).sum() - assert result == expected - - -def test_udaf_analytic_tzcol(t_timestamp, df_timestamp): - expr = my_tz_min(t_timestamp.a) - - result = expr.execute() - - expected = my_tz_min.func(df_timestamp.a.compute()) - assert result == expected - - -def test_udaf_elementwise_tzcol(t_timestamp, df_timestamp): - expr = my_tz_add_one(t_timestamp.a) - - result = expr.execute().reset_index(drop=True) - - expected = my_tz_add_one.func(df_timestamp.a.compute()) - tm.assert_series_equal(result, expected, check_index=False, check_names=False) - - -def test_udaf_analytic(t, df): - expr = zscore(t.c) - - assert isinstance(expr, ir.Column) - - result = expr.execute() - - def f(s): - return s.sub(s.mean()).div(s.std()) - - expected = (f(df.c)).compute() - tm.assert_series_equal(result, expected, check_index=False, check_names=False) - - -def test_udaf_analytic_group_by(t, df): - expr = zscore(t.c).over(ibis.window(group_by=t.key)) - - assert isinstance(expr, ir.Column) - - result = expr.execute() - - def f(s): - return s.sub(s.mean()).div(s.std()) - - expected = df.groupby("key").c.transform(f).compute() - # We don't check names here because the udf is used "directly". - # We could potentially special case this and set the name directly - # if the udf is only being run on one column. - tm.assert_series_equal( - result.sort_index(), expected.sort_index(), check_names=False, check_index=False - ) - - -def test_udaf_group_by(t2, df2): - expr = t2.group_by(t2.key).aggregate(my_corr=my_corr(t2.a, t2.b)) - - result = expr.execute().sort_values("key").reset_index(drop=True) - - dfi = df2.set_index("key").compute() - expected = pd.DataFrame( - { - "key": list("def"), - "my_corr": [ - dfi.loc[value, "a"].corr(dfi.loc[value, "b"]) for value in "def" - ], - } - ) - - tm.assert_frame_equal( - result.reset_index(drop=True), expected.reset_index(drop=True) - ) - - -def test_udaf_group_by_multikey(t2, df2): - expr = t2.group_by([t2.key, t2.d]).aggregate(my_corr=my_corr(t2.a, t2.b)) - - result = expr.execute().sort_values("key").reset_index(drop=True) - - dfi = df2.set_index("key").compute() - expected = pd.DataFrame( - { - "key": list("def"), - "d": list("aad"), - "my_corr": [ - dfi.loc[value, "a"].corr(dfi.loc[value, "b"]) for value in "def" - ], - } - ) - tm.assert_frame_equal( - result.reset_index(drop=True), expected.reset_index(drop=True) - ) - - -def test_udaf_group_by_multikey_tzcol(t_timestamp, df_timestamp): - expr = t_timestamp.group_by([t_timestamp.b, t_timestamp.c]).aggregate( - my_min_time=my_tz_min(t_timestamp.a) - ) - - result = expr.execute().sort_values("b").reset_index(drop=True) - expected = ( - df_timestamp.groupby(["b", "c"]) - .min() - .reset_index() - .rename(columns={"a": "my_min_time"}) - .compute() - ) - tm.assert_frame_equal( - result.reset_index(drop=True), expected.reset_index(drop=True) - ) - - -def test_compose_udfs(t2, df2): - expr = times_two(add_one(t2.a)) - result = expr.execute().reset_index(drop=True) - expected = df2.a.add(1.0).mul(2.0).compute() - tm.assert_series_equal(result, expected, check_names=False, check_index=False) - - -def test_udaf_window(t2, df2): - window = ibis.trailing_window(2, order_by="a", group_by="key") - expr = t2.mutate(rolled=my_mean(t2.b).over(window)) - result = expr.execute().sort_values(["key", 
"a"]) - expected = ( - df2.compute() - .sort_values(["key", "a"]) - .assign( - rolled=lambda df: df.groupby("key") - .b.rolling(3, min_periods=1) - .mean() - .reset_index(level=0, drop=True) - ) - ) - tm.assert_frame_equal(result, expected) - - -def test_array_return_type_reduction(t, df): - """Tests reduction UDF returning an array.""" - expr = collect(t.b) - result = expr.execute() - expected = df.b.compute().tolist() - assert list(result) == expected - - -def test_array_return_type_reduction_window(t, df): - """Tests reduction UDF returning an array, used over a window.""" - expr = collect(t.b).over(ibis.window()) - result = expr.execute() - expected_raw = df.b.compute().tolist() - expected = pd.Series([expected_raw] * len(df)) - tm.assert_series_equal(result, expected, check_index=False, check_names=False) - - -def test_array_return_type_reduction_group_by(t, df): - """Tests reduction UDF returning an array, used in a grouped agg.""" - expr = t.group_by(t.key).aggregate(quantiles_col=collect(t.b)) - result = expr.execute() - - df = df.compute() # Convert to Pandas - expected_col = df.groupby(df.key).b.agg(lambda s: s.tolist()) - expected = pd.DataFrame({"quantiles_col": expected_col}).reset_index() - - tm.assert_frame_equal( - result.sort_values("key").reset_index(drop=True), - expected.sort_values("key").reset_index(drop=True), - ) - - -def test_elementwise_udf_with_many_args(t2): - with pytest.warns(FutureWarning, match="v9.0"): - - @elementwise( - input_type=[dt.double] * 16 + [dt.int32] * 8, output_type=dt.double - ) - def my_udf( - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - c10, - c11, - c12, - c13, - c14, - c15, - c16, - c17, - c18, - c19, - c20, - c21, - c22, - c23, - c24, - ): - return c1 - - expr = my_udf(*([t2.a] * 8 + [t2.b] * 8 + [t2.c] * 8)) - result = expr.execute() - expected = t2.a.execute() - - tm.assert_series_equal(result, expected, check_names=False, check_index=False) - - -# ----------------- -# Test raised errors -# ----------------- - - -def test_udaf_parameter_mismatch(): - with pytest.raises(TypeError): - with pytest.warns(FutureWarning, match="v9.0"): - - @reduction(input_type=[dt.double], output_type=dt.double) - def my_corr(lhs, rhs, **kwargs): - pass - - -def test_udf_parameter_mismatch(): - with pytest.raises(TypeError): - with pytest.warns(FutureWarning, match="v9.0"): - - @reduction(input_type=[], output_type=dt.double) - def my_corr2(lhs, **kwargs): - pass - - -def test_udf_error(t): - with pytest.warns(FutureWarning, match="v9.0"): - - @elementwise(input_type=[dt.double], output_type=dt.double) - def error_udf(s): - raise ValueError("xxx") - - with pytest.raises(ValueError): - error_udf(t.c).execute() diff --git a/ibis/backends/dask/tests/test_window.py b/ibis/backends/dask/tests/test_window.py deleted file mode 100644 index 2a6d17c67e136..0000000000000 --- a/ibis/backends/dask/tests/test_window.py +++ /dev/null @@ -1,526 +0,0 @@ -from __future__ import annotations - -from datetime import date -from operator import methodcaller - -import dask.dataframe as dd -import numpy as np -import pandas as pd -import pytest -from dask.dataframe.utils import tm - -import ibis -import ibis.expr.datatypes as dt -from ibis.backends.dask import Backend -from ibis.legacy.udf.vectorized import reduction - - -@pytest.fixture(scope="session") -def sort_kind(): - return "mergesort" - - -default = pytest.mark.parametrize("default", [ibis.null(), ibis.literal("a")]) -row_offset = pytest.mark.parametrize("row_offset", list(map(ibis.literal, [-1, 1, 0]))) 
-range_offset = pytest.mark.parametrize( - "range_offset", - [ - ibis.interval(days=1), - 2 * ibis.interval(days=1), - -2 * ibis.interval(days=1), - ], -) - - -@pytest.fixture -def row_window(): - return ibis.window(following=0, order_by="plain_int64") - - -@pytest.fixture -def range_window(): - return ibis.window(following=0, order_by="plain_datetimes_naive") - - -@default -@row_offset -def test_lead(con, t, df, row_offset, default, row_window): - expr = t.dup_strings.lead(row_offset, default=default).over(row_window) - result = expr.execute() - expected = df.dup_strings.shift(con.execute(-row_offset)).compute() - if default is not ibis.null(): - expected = expected.fillna(con.execute(default)) - tm.assert_series_equal(result, expected, check_names=False) - - -@default -@row_offset -def test_lag(con, t, df, row_offset, default, row_window): - expr = t.dup_strings.lag(row_offset, default=default).over(row_window) - result = expr.execute() - expected = df.dup_strings.shift(con.execute(row_offset)).compute() - if default is not ibis.null(): - expected = expected.fillna(con.execute(default)) - tm.assert_series_equal(result, expected, check_names=False) - - -@default -@range_offset -def test_lead_delta(con, t, pandas_df, range_offset, default, range_window): - expr = t.dup_strings.lead(range_offset, default=default).over(range_window) - result = expr.execute() - - expected = ( - pandas_df[["plain_datetimes_naive", "dup_strings"]] - .set_index("plain_datetimes_naive") - .squeeze() - .shift(freq=con.execute(-range_offset)) - .reindex(pandas_df.plain_datetimes_naive) - .reset_index(drop=True) - ) - if default is not ibis.null(): - expected = expected.fillna(con.execute(default)) - tm.assert_series_equal(result, expected, check_names=False) - - -@default -@range_offset -@pytest.mark.filterwarnings("ignore:Non-vectorized") -def test_lag_delta(t, con, pandas_df, range_offset, default, range_window): - expr = t.dup_strings.lag(range_offset, default=default).over(range_window) - result = expr.execute() - - expected = ( - pandas_df[["plain_datetimes_naive", "dup_strings"]] - .set_index("plain_datetimes_naive") - .squeeze() - .shift(freq=con.execute(range_offset)) - .reindex(pandas_df.plain_datetimes_naive) - .reset_index(drop=True) - ) - if default is not ibis.null(): - expected = expected.fillna(con.execute(default)) - tm.assert_series_equal(result, expected, check_names=False) - - -@pytest.mark.xfail(reason="Flaky test because of Dask #10034", strict=False) -def test_groupby_first(t, df): - gb = t.group_by(t.dup_strings) - expr = gb.mutate(first_value=t.plain_int64.first()) - result = expr.execute() - - df = df.compute() - gb = df.groupby("dup_strings") - df = df.reset_index(drop=True) - - expected = df.assign( - first_value=gb.plain_int64.transform("first"), - ).reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - -# FIXME dask issue with non deterministic groupby results. -# The issue relates to the shuffle method on a local cluster, using npartitions=1 in tests avoids it. 
-# https://github.com/dask/dask/issues/10034 -@pytest.mark.skip(reason="dask #10034") -def test_group_by_mutate_analytic(t, df): - gb = t.group_by(t.dup_strings) - expr = gb.mutate( - first_value=t.plain_int64.first(), - last_value=t.plain_strings.last(), - avg_broadcast=t.plain_float64 - t.plain_float64.mean(), - delta=(t.plain_int64 - t.plain_int64.lag()) - / (t.plain_float64 - t.plain_float64.lag()), - ) - result = expr.execute() - - df = df.compute() - gb = df.groupby("dup_strings") - df = df.reset_index(drop=True) - expected = df.assign( - first_value=gb.plain_int64.transform("first"), - last_value=gb.plain_strings.transform("last"), - avg_broadcast=df.plain_float64 - gb.plain_float64.transform("mean"), - delta=( - (df.plain_int64 - gb.plain_int64.shift(1)) - / (df.plain_float64 - gb.plain_float64.shift(1)) - ), - ).reset_index(drop=True) - - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_players(players, players_df): - lagged = players.mutate(pct=lambda t: t.G - t.G.lag()) - expected = players_df.assign( - pct=players_df.G - players_df.groupby("playerID").G.shift(1) - ) - cols = expected.columns.tolist() - result = lagged.execute()[cols].sort_values(cols).reset_index(drop=True) - expected = expected.sort_values(cols).reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - -def test_batting_filter_mean(batting, batting_df): - expr = batting[batting.G > batting.G.mean()] - result = expr.execute() - expected = ( - batting_df[batting_df.G > batting_df.G.mean()].reset_index(drop=True).compute() - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_batting_zscore(players, players_df): - expr = players.mutate(g_z=lambda t: (t.G - t.G.mean()) / t.G.std()) - - gb = players_df.groupby("playerID") - expected = players_df.assign( - g_z=(players_df.G - gb.G.transform("mean")) / gb.G.transform("std") - ) - cols = expected.columns.tolist() - result = expr.execute()[cols].sort_values(cols).reset_index(drop=True) - expected = expected.sort_values(cols).reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - -def test_batting_avg_change_in_games_per_year(players, players_df): - expr = players.mutate( - delta=lambda t: (t.G - t.G.lag()) / (t.yearID - t.yearID.lag()) - ) - - gb = players_df.groupby("playerID") - expected = players_df.assign( - delta=(players_df.G - gb.G.shift(1)) / (players_df.yearID - gb.yearID.shift(1)) - ) - - cols = expected.columns.tolist() - result = expr.execute()[cols].sort_values(cols).reset_index(drop=True) - expected = expected.sort_values(cols).reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("op", ["sum", "mean", "min", "max"]) -def test_batting_specific_cumulative(batting, batting_pandas_df, op, sort_kind): - ibis_method = methodcaller(f"cum{op}", order_by=batting.yearID) - expr = ibis_method(batting.G).name("tmp") - result = expr.execute().astype("float64") - - pandas_method = methodcaller(op) - expected = pandas_method( - batting_pandas_df[["G", "yearID"]] - .sort_values("yearID", kind=sort_kind) - .G.expanding() - ).reset_index(drop=True) - tm.assert_series_equal(result, expected.rename("tmp")) - - -def test_batting_cumulative(batting, batting_pandas_df, sort_kind): - expr = batting.mutate( - more_values=lambda t: t.G.sum().over(ibis.cumulative_window(order_by=t.yearID)) - ) - result = expr.execute() - - columns = ["G", "yearID"] - more_values = ( - batting_pandas_df[columns] - .sort_values("yearID", kind=sort_kind) - .G.expanding() - .sum() - 
.astype("int64") - ) - expected = batting_pandas_df.assign(more_values=more_values) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_batting_cumulative_partitioned(batting, batting_pandas_df, sort_kind): - group_by = "playerID" - order_by = "yearID" - - t = batting - expr = t.G.sum().over(ibis.cumulative_window(order_by=order_by, group_by=group_by)) - expr = t.mutate(cumulative=expr) - result = expr.execute() - - columns = [group_by, order_by, "G"] - expected = ( - batting_pandas_df[columns] - .set_index(order_by) - .groupby(group_by) - .G.expanding() - .sum() - .rename("cumulative") - ) - - tm.assert_series_equal( - result.set_index([group_by, order_by]).sort_index().cumulative, - expected.sort_index().astype("int64"), - ) - - -def test_batting_rolling(batting, batting_pandas_df, sort_kind): - expr = batting.mutate( - more_values=lambda t: t.G.sum().over(ibis.trailing_window(5, order_by=t.yearID)) - ) - result = expr.execute() - - columns = ["G", "yearID"] - more_values = ( - batting_pandas_df[columns] - .sort_values("yearID", kind=sort_kind) - .G.rolling(6, min_periods=1) - .sum() - .astype("int64") - ) - expected = batting_pandas_df.assign(more_values=more_values) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_batting_rolling_partitioned(batting, batting_pandas_df, sort_kind): - t = batting - group_by = "playerID" - order_by = "yearID" - expr = t.G.sum().over( - ibis.trailing_window(3, order_by=t[order_by], group_by=t[group_by]) - ) - expr = t.mutate(rolled=expr) - result = expr.execute() - - columns = [group_by, order_by, "G"] - expected = ( - batting_pandas_df[columns] - .set_index(order_by) - .groupby(group_by) - .G.rolling(4, min_periods=1) - .sum() - .rename("rolled") - ) - - tm.assert_series_equal( - result.set_index([group_by, order_by]).sort_index().rolled, - expected.sort_index().astype("int64"), - ) - - -@pytest.mark.parametrize( - "window", - [ - pytest.param( - ibis.window(order_by="yearID"), - marks=pytest.mark.xfail(reason="Cumulative windows not supported"), - ), - pytest.param( - ibis.window(order_by="yearID", group_by="playerID"), - marks=pytest.mark.xfail(reason="Group and order by not implemented"), - ), - ], -) -def test_window_failure_mode(batting, batting_df, window): - # can't have order by without a following value of 0 - expr = batting.mutate(more_values=batting.G.sum().over(window)) - with pytest.raises(ibis.common.exceptions.OperationNotDefinedError): - expr.execute() - - -def test_scalar_broadcasting(batting, batting_df): - expr = batting.mutate(demeaned=batting.G - batting.G.mean()) - result = expr.execute() - expected = batting_df.assign(demeaned=batting_df.G - batting_df.G.mean()) - expected = expected.compute() - - tm.assert_frame_equal(result, expected) - - -def test_mutate_with_window_after_join(con, sort_kind): - left_df = pd.DataFrame( - { - "ints": [0, 1, 2], - "strings": ["a", "b", "c"], - "dates": pd.date_range("20170101", periods=3), - } - ) - right_df = pd.DataFrame( - { - "group": [0, 1, 2] * 3, - "value": [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8], - } - ) - - left = ibis.memtable(left_df) - right = ibis.memtable(right_df) - - joined = left.outer_join(right, left.ints == right.group) - proj = joined[left, right.value] - expr = proj.group_by("ints").mutate(sum=proj.value.sum()) - result = con.execute(expr) - expected = pd.DataFrame( - { - "dates": pd.concat([left_df.dates] * 3) - .sort_values(kind=sort_kind) - .reset_index(drop=True), - "ints": [0] * 3 + [1] * 3 + [2] * 3, - "strings": ["a"] * 3 + 
["b"] * 3 + ["c"] * 3, - "value": [0.0, 3.0, 6.0, 1.0, 4.0, 7.0, np.nan, np.nan, 8.0], - "sum": [9.0] * 3 + [12.0] * 3 + [8.0] * 3, - } - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_mutate_scalar_with_window_after_join(npartitions): - left_df = dd.from_pandas(pd.DataFrame({"ints": range(3)}), npartitions=npartitions) - right_df = dd.from_pandas( - pd.DataFrame( - { - "group": [0, 1, 2] * 3, - "value": [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8], - } - ), - npartitions=npartitions, - ) - con = Backend().connect({"left": left_df, "right": right_df}) - left, right = map(con.table, ("left", "right")) - - joined = left.outer_join(right, left.ints == right.group) - proj = joined[left, right.value] - expr = proj.mutate(sum=proj.value.sum(), const=ibis.literal(1)) - result = expr.execute() - result = result.sort_values(["ints", "value"]).reset_index(drop=True) - expected = ( - pd.DataFrame( - { - "ints": [0] * 3 + [1] * 3 + [2] * 3, - "value": [0.0, 3.0, 6.0, 1.0, 4.0, 7.0, np.nan, np.nan, 8.0], - "sum": [29.0] * 9, - "const": np.ones(9, dtype="int8"), - } - ) - .sort_values(["ints", "value"]) - .reset_index(drop=True) - ) - - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_project_scalar_after_join(npartitions): - left_df = dd.from_pandas(pd.DataFrame({"ints": range(3)}), npartitions=npartitions) - right_df = dd.from_pandas( - pd.DataFrame( - { - "group": [0, 1, 2] * 3, - "value": [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8], - } - ), - npartitions=npartitions, - ) - con = ibis.dask.connect({"left": left_df, "right": right_df}) - left, right = map(con.table, ("left", "right")) - - joined = left.outer_join(right, left.ints == right.group) - proj = joined[left, right.value] - expr = proj[proj.value.sum().name("sum"), ibis.literal(1).name("const")] - result = expr.execute().reset_index(drop=True) - expected = pd.DataFrame( - { - "sum": [29.0] * 9, - "const": np.ones(9, dtype="int8"), - } - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_project_list_scalar(npartitions): - df = dd.from_pandas(pd.DataFrame({"ints": range(3)}), npartitions=npartitions) - con = ibis.dask.connect({"df": df}) - table = con.table("df") - expr = table.mutate(res=table.ints.quantile([0.5, 0.95])) - result = expr.execute() - - expected = pd.Series([[1.0, 1.9] for _ in range(3)], name="res") - tm.assert_series_equal(result.res, expected) - - -def test_window_grouping_key_has_scope(t, df): - param = ibis.param(dt.string) - window = ibis.window(group_by=t.dup_strings + param) - expr = t.plain_int64.mean().over(window) - result = expr.execute(params={param: "a"}) - expected = df.groupby(df.dup_strings + "a").plain_int64.transform("mean").compute() - - tm.assert_series_equal( - result, expected.sort_index().reset_index(drop=True), check_names=False - ) - - -def test_window_on_and_by_key_as_window_input(t, df): - order_by = "plain_int64" - group_by = "dup_ints" - control = "plain_float64" - - row_window = ibis.trailing_window(order_by=order_by, group_by=group_by, preceding=1) - - # Test built-in function - - tm.assert_series_equal( - t[order_by].count().over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - tm.assert_series_equal( - t[group_by].count().over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - # Test UDF - with pytest.warns(FutureWarning, match="v9.0"): - - @reduction(input_type=[dt.int64], output_type=dt.int64) - def count(v): - return len(v) - - 
@reduction(input_type=[dt.int64, dt.int64], output_type=dt.int64) - def count_both(v1, v2): - return len(v1) - - tm.assert_series_equal( - count(t[order_by]).over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - tm.assert_series_equal( - count(t[group_by]).over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - tm.assert_series_equal( - count_both(t[group_by], t[order_by]).over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - -@pytest.fixture -def events(npartitions) -> dd.DataFrame: - df = pd.DataFrame( - { - "event_id": [1] * 4 + [2] * 6 + [3] * 2, - "measured_on": map( - pd.Timestamp, - map( - date, - [2021] * 12, - [6] * 4 + [5] * 6 + [7] * 2, - range(1, 13), - ), - ), - "measurement": np.nan, - } - ) - df.at[1, "measurement"] = 5.0 - df.at[4, "measurement"] = 42.0 - df.at[5, "measurement"] = 42.0 - df.at[7, "measurement"] = 11.0 - return dd.from_pandas(df, npartitions=npartitions) diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py index 992251c6b90fa..90ac4fea0c71e 100644 --- a/ibis/backends/tests/test_aggregation.py +++ b/ibis/backends/tests/test_aggregation.py @@ -538,16 +538,6 @@ def mean_and_std(v): ["impala", "mysql", "sqlite", "mssql", "druid", "oracle", "exasol"], raises=com.OperationNotDefinedError, ), - pytest.mark.notimpl( - ["dask"], - raises=(AttributeError, TypeError), - reason=( - "For 'is_in' case: 'Series' object has no attribute 'arraycollect'" - "For 'no_cond' case: TypeError: Object " - " is not " - "callable or a string" - ), - ), pytest.mark.notyet(["flink"], raises=com.OperationNotDefinedError), ], ), @@ -668,15 +658,7 @@ def test_first_last(backend, alltypes, method, filtered): @pytest.mark.notimpl( - [ - "clickhouse", - "dask", - "exasol", - "flink", - "pandas", - "pyspark", - "sqlite", - ], + ["clickhouse", "exasol", "flink", "pandas", "pyspark", "sqlite"], raises=com.UnsupportedOperationError, ) @pytest.mark.notimpl( @@ -808,11 +790,6 @@ def test_count_distinct_star(alltypes, df, ibis_cond, pandas_cond): ], raises=com.OperationNotDefinedError, ), - pytest.mark.never( - ["dask"], - reason="backend implements approximate quantiles", - raises=AssertionError, - ), pytest.mark.never( ["trino"], reason="backend implements approximate quantiles", @@ -864,11 +841,6 @@ def test_count_distinct_star(alltypes, df, ibis_cond, pandas_cond): reason="backend implements approximate quantiles", raises=AssertionError, ), - pytest.mark.never( - ["dask"], - reason="backend implements approximate quantiles", - raises=AssertionError, - ), pytest.mark.never( ["flink"], reason="backend doesn't implement approximate quantiles yet", @@ -926,11 +898,6 @@ def test_quantile( lambda t, where: t.G[where].cov(t.RBI[where], ddof=0), id="covar_pop", marks=[ - pytest.mark.notyet( - ["dask"], - reason="dask doesn't support `cov(ddof=0)` yet", - raises=com.UnsupportedOperationError, - ), pytest.mark.notimpl( ["polars", "druid"], raises=com.OperationNotDefinedError, @@ -976,11 +943,6 @@ def test_quantile( lambda t, where: t.G[where].corr(t.RBI[where]), id="corr_pop", marks=[ - pytest.mark.notyet( - ["dask"], - raises=com.UnsupportedOperationError, - reason="dask doesn't support `corr(ddof=0)` yet", - ), pytest.mark.notimpl( ["druid"], raises=com.OperationNotDefinedError, @@ -1045,11 +1007,6 @@ def test_quantile( lambda t, where: (t.G[where] > 34.0).cov(t.G[where] <= 34.0, ddof=0), id="covar_pop_bool", 
marks=[ - pytest.mark.notyet( - ["dask"], - raises=com.UnsupportedOperationError, - reason="dask doesn't support `cov(ddof=0)` yet", - ), pytest.mark.notimpl( ["polars", "druid"], raises=com.OperationNotDefinedError, @@ -1074,11 +1031,6 @@ def test_quantile( lambda t, where: (t.G[where] > 34.0).corr(t.G[where] <= 34.0), id="corr_pop_bool", marks=[ - pytest.mark.notyet( - ["dask"], - raises=com.UnsupportedOperationError, - reason="dask doesn't support `corr(ddof=0)` yet", - ), pytest.mark.notimpl( ["druid"], raises=com.OperationNotDefinedError, @@ -1157,7 +1109,6 @@ def test_approx_median(alltypes): ["impala", "mysql", "mssql", "druid", "trino"], raises=com.OperationNotDefinedError, ) -@pytest.mark.notyet(["dask"], raises=NotImplementedError) @pytest.mark.never( ["flink"], reason="backend doesn't implement approximate quantiles yet", @@ -1185,7 +1136,6 @@ def test_median(alltypes, df): @pytest.mark.notyet( ["pyspark"], raises=AssertionError, reason="pyspark returns null for string median" ) -@pytest.mark.notimpl(["dask"], raises=(AssertionError, NotImplementedError, TypeError)) @pytest.mark.notyet( ["snowflake"], raises=SnowflakeProgrammingError, @@ -1231,7 +1181,6 @@ def test_string_quantile(alltypes, func): raises=SnowflakeProgrammingError, reason="doesn't support median of dates", ) -@pytest.mark.notimpl(["dask"], raises=(AssertionError, NotImplementedError, TypeError)) @pytest.mark.notyet(["datafusion"], raises=Exception, reason="not supported upstream") @pytest.mark.notyet( ["polars"], raises=PolarsInvalidOperationError, reason="not supported upstream" @@ -1318,7 +1267,6 @@ def test_group_concat( [ "clickhouse", "datafusion", - "dask", "druid", "flink", "impala", @@ -1345,26 +1293,11 @@ def test_group_concat_ordered(alltypes, df, filtered): @pytest.mark.notimpl( - [ - "druid", - "exasol", - "flink", - "impala", - "mssql", - "mysql", - "oracle", - "sqlite", - ], + ["druid", "exasol", "flink", "impala", "mssql", "mysql", "oracle", "sqlite"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl( - [ - "clickhouse", - "dask", - "pandas", - "pyspark", - ], - raises=com.UnsupportedOperationError, + ["clickhouse", "pandas", "pyspark"], raises=com.UnsupportedOperationError ) @pytest.mark.parametrize( "filtered", @@ -1429,11 +1362,6 @@ def test_topk_op(alltypes, df): @pytest.mark.notyet( ["druid"], raises=PyDruidProgrammingError, reason="Java NullPointerException" ) -@pytest.mark.notimpl( - ["dask"], - raises=NotImplementedError, - reason="sorting on aggregations not yet implemented", -) @pytest.mark.notyet( ["flink"], raises=Py4JError, reason="Flink doesn't support semi joins" ) @@ -1653,7 +1581,6 @@ def test_group_concat_over_window(backend, con): backend.assert_frame_equal(result, expected) -@pytest.mark.xfail_version(dask=["dask<2024.2.0"]) def test_value_counts_on_expr(backend, alltypes, df): expr = alltypes.bigint_col.add(1).value_counts() columns = expr.columns @@ -1689,7 +1616,7 @@ def test_group_by_expr(backend, con): ibis.null("str"), marks=[ pytest.mark.notimpl( - ["pandas", "dask"], + ["pandas"], reason="nulls are discarded by default in group bys", raises=IndexError, ), diff --git a/ibis/backends/tests/test_api.py b/ibis/backends/tests/test_api.py index 4b71bdb4ffee8..d0ac2cc5d2139 100644 --- a/ibis/backends/tests/test_api.py +++ b/ibis/backends/tests/test_api.py @@ -24,7 +24,6 @@ def test_version(backend): "polars", "clickhouse", "sqlite", - "dask", "exasol", "pandas", "druid", diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 
3c66946e22dd8..adc9e0dae4fe9 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -361,7 +361,6 @@ def test_unnest_no_nulls(backend): @builtin_array -@pytest.mark.notimpl("dask", raises=ValueError) @pytest.mark.notimpl( "pandas", raises=ValueError, @@ -435,7 +434,7 @@ def test_array_slice(backend, start, stop): reason="TODO(Kexiang): seems a bug", ) @pytest.mark.notimpl( - ["dask", "pandas"], + ["pandas"], raises=com.OperationNotDefinedError, reason="Operation 'ArrayMap' is not implemented for this backend", ) @@ -484,11 +483,10 @@ def test_array_map(con, input, output, func): @builtin_array @pytest.mark.notimpl( - ["dask", "datafusion", "flink", "pandas", "polars"], - raises=com.OperationNotDefinedError, + ["datafusion", "flink", "pandas", "polars"], raises=com.OperationNotDefinedError ) @pytest.mark.notimpl( - ["dask", "pandas"], + ["pandas"], raises=com.OperationNotDefinedError, reason="Operation 'ArrayMap' is not implemented for this backend", ) @@ -787,10 +785,7 @@ def test_array_union(con, a, b, expected_array): @builtin_array -@pytest.mark.notimpl( - ["dask", "pandas", "polars", "flink"], - raises=com.OperationNotDefinedError, -) +@pytest.mark.notimpl(["pandas", "polars", "flink"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl( ["sqlite"], raises=com.UnsupportedBackendType, reason="Unsupported type: Array..." ) @@ -879,7 +874,6 @@ def test_unnest_struct_with_multiple_fields(con): array_zip_notimpl = pytest.mark.notimpl( [ - "dask", "datafusion", "druid", "oracle", @@ -1158,8 +1152,7 @@ def test_unnest_empty_array(con): @builtin_array @pytest.mark.notimpl( - ["datafusion", "flink", "polars", "dask", "pandas"], - raises=com.OperationNotDefinedError, + ["datafusion", "flink", "polars", "pandas"], raises=com.OperationNotDefinedError ) @pytest.mark.notimpl(["sqlite"], raises=com.UnsupportedBackendType) @pytest.mark.notyet( @@ -1179,7 +1172,7 @@ def test_array_map_with_conflicting_names(backend, con): @builtin_array @pytest.mark.notimpl( - ["datafusion", "flink", "polars", "sqlite", "dask", "pandas", "sqlite"], + ["datafusion", "flink", "polars", "sqlite", "pandas", "sqlite"], raises=com.OperationNotDefinedError, ) def test_complex_array_map(con): @@ -1371,9 +1364,6 @@ def test_repr_timestamp_array(con, monkeypatch): @pytest.mark.notimpl( ["pandas"], raises=ValueError, reason="reindex on duplicate values" ) -@pytest.mark.notimpl( - ["dask"], raises=AssertionError, reason="DataFrame.index are different" -) def test_unnest_range(con): expr = ibis.range(2).unnest().name("x").as_table().mutate({"y": 1.0}) result = con.execute(expr) @@ -1405,7 +1395,7 @@ def test_array_literal_with_exprs(con, input, expected): @pytest.mark.notimpl( - ["datafusion", "postgres", "pandas", "polars", "risingwave", "dask", "flink"], + ["datafusion", "postgres", "pandas", "polars", "risingwave", "flink"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl( @@ -1425,8 +1415,7 @@ def test_zip_unnest_lift(con): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "dask", "flink"], - raises=com.OperationNotDefinedError, + ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError ) @pytest.mark.parametrize( "colspec", @@ -1441,8 +1430,7 @@ def test_table_unnest(backend, colspec): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "dask", "flink"], - raises=com.OperationNotDefinedError, + ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError ) def test_table_unnest_with_offset(backend): t = 
backend.array_types @@ -1467,8 +1455,7 @@ def test_table_unnest_with_offset(backend): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "dask", "flink"], - raises=com.OperationNotDefinedError, + ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError ) def test_table_unnest_with_keep_empty(con): t = ibis.memtable(pd.DataFrame({"y": [[], None, ["a"]]})) @@ -1478,8 +1465,7 @@ def test_table_unnest_with_keep_empty(con): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "dask", "flink"], - raises=com.OperationNotDefinedError, + ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError ) @pytest.mark.notyet( ["risingwave"], raises=PsycoPg2InternalError, reason="not supported in risingwave" @@ -1493,8 +1479,7 @@ def test_table_unnest_column_expr(backend): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "dask", "flink"], - raises=com.OperationNotDefinedError, + ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError ) @pytest.mark.notimpl(["trino"], raises=TrinoUserError) @pytest.mark.notimpl(["postgres"], raises=PsycoPg2SyntaxError) @@ -1522,7 +1507,7 @@ def test_table_unnest_array_of_struct_of_array(con): notimpl_aggs = pytest.mark.notimpl( - ["datafusion", "flink", "pandas", "dask"], raises=com.OperationNotDefinedError + ["datafusion", "flink", "pandas"], raises=com.OperationNotDefinedError ) diff --git a/ibis/backends/tests/test_client.py b/ibis/backends/tests/test_client.py index 8ae35e6353b7f..4875f5c07864c 100644 --- a/ibis/backends/tests/test_client.py +++ b/ibis/backends/tests/test_client.py @@ -255,7 +255,7 @@ def test_query_schema(ddl_backend, expr_fn, expected): @pytest.mark.notimpl(["mssql"]) -@pytest.mark.never(["dask", "pandas"], reason="dask and pandas do not support SQL") +@pytest.mark.never(["pandas"], reason="pandas does not support SQL") def test_sql(backend, con): # execute the expression using SQL query table = backend.format_table("functional_alltypes") @@ -347,7 +347,6 @@ def test_create_temporary_table_from_schema(con_no_data, new_schema): [ "bigquery", "clickhouse", - "dask", "datafusion", "druid", "duckdb", @@ -461,9 +460,7 @@ def employee_data_2_temp_table( con.drop_table(temp_table_name, force=True) -@pytest.mark.notimpl( - ["polars", "pandas", "dask"], reason="`insert` method not implemented" -) +@pytest.mark.notimpl(["polars", "pandas"], reason="`insert` method not implemented") def test_insert_no_overwrite_from_dataframe( backend, con, test_employee_data_2, employee_empty_temp_table ): @@ -477,9 +474,7 @@ def test_insert_no_overwrite_from_dataframe( ) -@pytest.mark.notimpl( - ["polars", "pandas", "dask"], reason="`insert` method not implemented" -) +@pytest.mark.notimpl(["polars", "pandas"], reason="`insert` method not implemented") @pytest.mark.notyet( ["risingwave"], raises=PsycoPg2InternalError, @@ -506,9 +501,7 @@ def test_insert_overwrite_from_dataframe( ) -@pytest.mark.notimpl( - ["polars", "pandas", "dask"], reason="`insert` method not implemented" -) +@pytest.mark.notimpl(["polars", "pandas"], reason="`insert` method not implemented") def test_insert_no_overwrite_from_expr( backend, con, employee_empty_temp_table, employee_data_2_temp_table ): @@ -524,9 +517,7 @@ def test_insert_no_overwrite_from_expr( ) -@pytest.mark.notimpl( - ["polars", "pandas", "dask"], reason="`insert` method not implemented" -) +@pytest.mark.notimpl(["polars", "pandas"], reason="`insert` method not implemented") @pytest.mark.notyet( ["datafusion"], raises=Exception, reason="DELETE 
DML not implemented upstream" ) @@ -558,9 +549,7 @@ def test_insert_overwrite_from_expr( @pytest.mark.notyet( ["trino"], reason="memory connector doesn't allow writing to tables" ) -@pytest.mark.notimpl( - ["polars", "pandas", "dask"], reason="`insert` method not implemented" -) +@pytest.mark.notimpl(["polars", "pandas"], reason="`insert` method not implemented") @pytest.mark.notyet( ["datafusion"], raises=Exception, reason="DELETE DML not implemented upstream" ) @@ -587,7 +576,7 @@ def _emp(a, b, c, d): @pytest.mark.notimpl( - ["polars", "dask", "pandas"], + ["polars", "pandas"], raises=AttributeError, reason="`insert` method not implemented", ) @@ -614,7 +603,6 @@ def test_insert_from_memtable(con, temp_table): [ "bigquery", "clickhouse", - "dask", "druid", "exasol", "impala", @@ -649,12 +637,7 @@ def test_list_catalogs(con): @pytest.mark.notyet( - [ - "dask", - "druid", - "pandas", - "polars", - ], + ["druid", "pandas", "polars"], raises=AttributeError, reason="doesn't support the common notion of a database", ) @@ -724,11 +707,6 @@ def test_unsigned_integer_type(con, temp_table): marks=mark.clickhouse, id="clickhouse", ), - param( - "dask://", - marks=mark.dask, - id="dask", - ), param( "datafusion://", marks=mark.datafusion, @@ -907,7 +885,6 @@ def test_self_join_memory_table(backend, con, monkeypatch): [ "bigquery", "clickhouse", - "dask", "duckdb", "exasol", "impala", @@ -934,7 +911,6 @@ def test_self_join_memory_table(backend, con, monkeypatch): [ "bigquery", "clickhouse", - "dask", "duckdb", "exasol", "impala", @@ -961,7 +937,6 @@ def test_self_join_memory_table(backend, con, monkeypatch): [ "bigquery", "clickhouse", - "dask", "duckdb", "exasol", "impala", @@ -1256,7 +1231,7 @@ def test_set_backend(con, monkeypatch): "name", [ param(name, marks=getattr(mark, name), id=name) - for name in ("datafusion", "duckdb", "polars", "sqlite", "pandas", "dask") + for name in ("datafusion", "duckdb", "polars", "sqlite", "pandas") ], ) def test_set_backend_name(name, monkeypatch): @@ -1299,7 +1274,6 @@ def test_set_backend_url(url, monkeypatch): @pytest.mark.notyet( [ "bigquery", - "dask", "datafusion", "duckdb", "exasol", @@ -1464,8 +1438,7 @@ def test_list_catalogs_databases(con_create_catalog_database): @pytest.mark.notyet( - ["pandas", "dask", "polars", "datafusion"], - reason="this is a no-op for in-memory backends", + ["pandas", "polars", "datafusion"], reason="this is a no-op for in-memory backends" ) @pytest.mark.notyet( ["trino", "clickhouse", "impala", "bigquery", "flink"], @@ -1572,7 +1545,7 @@ def test_schema_with_caching(alltypes): @pytest.mark.notyet( ["druid"], raises=NotImplementedError, reason="doesn't support create_table" ) -@pytest.mark.notyet(["pandas", "dask", "polars"], reason="Doesn't support insert") +@pytest.mark.notyet(["pandas", "polars"], reason="Doesn't support insert") @pytest.mark.notyet( ["datafusion"], reason="Doesn't support table creation from records" ) @@ -1618,7 +1591,7 @@ def test_insert_using_col_name_not_position(con, first_row, second_row, monkeypa @pytest.mark.parametrize("top_level", [True, False]) -@pytest.mark.never(["dask", "pandas", "polars"], reason="don't have connection concept") +@pytest.mark.never(["pandas", "polars"], reason="don't have a connection concept") def test_from_connection(con, top_level): backend = getattr(ibis, con.name) if top_level else type(con) new_con = backend.from_connection(getattr(con, CON_ATTR.get(con.name, "con"))) diff --git a/ibis/backends/tests/test_column.py b/ibis/backends/tests/test_column.py index 
f6b4bd8ee0f41..cb5231ce696cf 100644 --- a/ibis/backends/tests/test_column.py +++ b/ibis/backends/tests/test_column.py @@ -9,7 +9,6 @@ [ "bigquery", "clickhouse", - "dask", "datafusion", "exasol", "impala", diff --git a/ibis/backends/tests/test_dot_sql.py b/ibis/backends/tests/test_dot_sql.py index 3cae14e370172..0ac259fa7c905 100644 --- a/ibis/backends/tests/test_dot_sql.py +++ b/ibis/backends/tests/test_dot_sql.py @@ -17,9 +17,7 @@ from ibis.backends.tests.base import PYTHON_SHORT_VERSION from ibis.backends.tests.errors import GoogleBadRequest, OracleDatabaseError -dot_sql_never = pytest.mark.never( - ["dask", "pandas"], reason="dask and pandas do not accept SQL" -) +dot_sql_never = pytest.mark.never(["pandas"], reason="pandas does not accept SQL") _NAMES = { "bigquery": f"ibis_gbq_testing_{getpass.getuser()}_{PYTHON_SHORT_VERSION}.functional_alltypes", @@ -218,7 +216,7 @@ def test_dot_sql_reuse_alias_with_different_types(backend, alltypes, df): backend.assert_series_equal(foo2.x.execute(), expected2) -_NO_SQLGLOT_DIALECT = ("pandas", "dask") +_NO_SQLGLOT_DIALECT = ("pandas",) no_sqlglot_dialect = [ param(dialect, marks=pytest.mark.xfail) for dialect in sorted(_NO_SQLGLOT_DIALECT) ] diff --git a/ibis/backends/tests/test_export.py b/ibis/backends/tests/test_export.py index 8d067959dbd1a..bffccbed7c3bb 100644 --- a/ibis/backends/tests/test_export.py +++ b/ibis/backends/tests/test_export.py @@ -29,7 +29,7 @@ limit = [ # limit not implemented for pandas-family backends - param(42, id="limit", marks=pytest.mark.notimpl(["dask", "pandas"])), + param(42, id="limit", marks=pytest.mark.notimpl(["pandas"])), ] no_limit = [param(None, id="nolimit")] @@ -138,7 +138,7 @@ def test_column_to_pyarrow_table_schema(awards_players): assert array.type == pa.string() or array.type == pa.large_string() -@pytest.mark.notimpl(["pandas", "dask", "datafusion", "flink"]) +@pytest.mark.notimpl(["pandas", "datafusion", "flink"]) @pytest.mark.notyet( ["clickhouse"], raises=AssertionError, @@ -153,7 +153,7 @@ def test_table_pyarrow_batch_chunk_size(awards_players): util.consume(batch_reader) -@pytest.mark.notimpl(["pandas", "dask", "datafusion", "flink"]) +@pytest.mark.notimpl(["pandas", "datafusion", "flink"]) @pytest.mark.notyet( ["clickhouse"], raises=AssertionError, @@ -170,7 +170,7 @@ def test_column_pyarrow_batch_chunk_size(awards_players): util.consume(batch_reader) -@pytest.mark.notimpl(["pandas", "dask"]) +@pytest.mark.notimpl(["pandas"]) @pytest.mark.notimpl( ["sqlite"], raises=pa.ArrowException, @@ -240,7 +240,6 @@ def test_table_to_parquet_writer_kwargs(version, tmp_path, backend, awards_playe [ "bigquery", "clickhouse", - "dask", "datafusion", "impala", "mssql", @@ -389,7 +388,6 @@ def test_to_pyarrow_decimal(backend, dtype, pyarrow_dtype): "snowflake", "sqlite", "bigquery", - "dask", "trino", "exasol", "druid", diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 9be4dd25a6c04..4e309170eb6ec 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -245,8 +245,7 @@ def test_coalesce(con, expr, expected): assert result == pytest.approx(expected) -# TODO(dask) - identicalTo - #2553 -@pytest.mark.notimpl(["clickhouse", "dask", "druid", "exasol"]) +@pytest.mark.notimpl(["clickhouse", "druid", "exasol"]) def test_identical_to(backend, alltypes, sorted_df): sorted_alltypes = alltypes.order_by("id") df = sorted_df @@ -364,7 +363,6 @@ def test_filter(backend, alltypes, sorted_df, predicate_fn, expected_fn): "exasol", "pandas", "pyspark", -
"dask", ] ) @pytest.mark.never( @@ -562,19 +560,11 @@ def test_drop_null_table(backend, alltypes, how, subset): param("id", {"by": "id"}), param(_.id, {"by": "id"}), param(lambda _: _.id, {"by": "id"}), - param( - ibis.desc("id"), - {"by": "id", "ascending": False}, - ), - param( - ["id", "int_col"], - {"by": ["id", "int_col"]}, - marks=pytest.mark.xfail_version(dask=["dask<2024.2.0"]), - ), + param(ibis.desc("id"), {"by": "id", "ascending": False}), + param(["id", "int_col"], {"by": ["id", "int_col"]}), param( ["id", ibis.desc("int_col")], {"by": ["id", "int_col"], "ascending": [True, False]}, - marks=pytest.mark.xfail_version(dask=["dask<2024.2.0"]), ), ], ) @@ -585,7 +575,7 @@ def test_order_by(backend, alltypes, df, key, df_kwargs): backend.assert_frame_equal(result, expected) -@pytest.mark.notimpl(["dask", "pandas", "polars", "mssql", "druid"]) +@pytest.mark.notimpl(["pandas", "polars", "mssql", "druid"]) @pytest.mark.notimpl( ["risingwave"], raises=PsycoPg2InternalError, @@ -722,7 +712,7 @@ def test_order_by_two_cols_nulls(con, op1, nf1, nf2, op2, expected): getattr(t["col2"], op2)(nulls_first=nf2), ) - if (con.name in ("pandas", "dask")) and (nf1 != nf2): + if con.name == "pandas" and nf1 != nf2: with pytest.raises( ValueError, match=f"{con.name} does not support specifying null ordering for individual column", @@ -844,11 +834,6 @@ def test_table_info_large(con): raises=com.OperationNotDefinedError, reason="mode is not supported", ), - pytest.mark.notimpl( - ["dask"], - raises=ValueError, - reason="Unable to concatenate DataFrame with unknown division specifying axis=1", - ), pytest.mark.notimpl( ["oracle"], raises=(OracleDatabaseError, com.OperationNotDefinedError), @@ -931,11 +916,6 @@ def test_table_info_large(con): raises=com.OperationNotDefinedError, reason="Mode is not supported and ORA-02000: missing AS keyword", ), - pytest.mark.notimpl( - ["dask"], - raises=ValueError, - reason="Unable to concatenate DataFrame with unknown division specifying axis=1", - ), ], id="string_col", ), @@ -1170,7 +1150,7 @@ def test_int_scalar(alltypes): assert result.dtype == np.int32 -@pytest.mark.notimpl(["dask", "datafusion", "pandas", "polars", "druid"]) +@pytest.mark.notimpl(["datafusion", "pandas", "polars", "druid"]) @pytest.mark.notyet( ["clickhouse"], reason="https://github.com/ClickHouse/ClickHouse/issues/6697" ) @@ -1188,7 +1168,6 @@ def test_exists(batting, awards_players, method_name): @pytest.mark.notimpl( [ - "dask", "datafusion", "mssql", "mysql", @@ -1219,7 +1198,6 @@ def test_typeof(con): raises=PsycoPg2InternalError, reason="https://github.com/risingwavelabs/risingwave/issues/1343", ) -@pytest.mark.xfail_version(dask=["dask<2024.2.0"]) @pytest.mark.notyet( ["mssql"], raises=PyODBCProgrammingError, @@ -1245,9 +1223,6 @@ def test_isin_uncorrelated( @pytest.mark.notimpl(["polars"], reason="incorrect answer") @pytest.mark.notimpl(["druid"]) -@pytest.mark.xfail_version( - dask=["dask<2024.2.0"], reason="not supported by the backend" -) def test_isin_uncorrelated_filter( backend, batting, awards_players, batting_df, awards_players_df ): @@ -1584,7 +1559,7 @@ def test_distinct_on_keep_is_none(backend, on): assert len(result) == len(expected) -@pytest.mark.notimpl(["dask", "pandas", "risingwave", "flink", "exasol"]) +@pytest.mark.notimpl(["pandas", "risingwave", "flink", "exasol"]) @pytest.mark.notyet( [ "sqlite", @@ -1643,7 +1618,6 @@ def test_hash(backend, alltypes, dtype): @pytest.mark.notimpl(["trino", "oracle", "exasol", "snowflake"]) @pytest.mark.notyet( [ - "dask", "datafusion", 
"druid", "duckdb", @@ -1676,7 +1650,6 @@ def hash_256(col): [ "bigquery", "clickhouse", - "dask", "datafusion", "flink", "impala", @@ -1763,7 +1736,7 @@ def test_cast(con, from_type, to_type, from_val, expected): assert result == expected -@pytest.mark.notimpl(["pandas", "dask", "oracle", "sqlite"]) +@pytest.mark.notimpl(["pandas", "oracle", "sqlite"]) @pytest.mark.parametrize( ("from_val", "to_type", "expected"), [ @@ -1805,7 +1778,6 @@ def test_try_cast(con, from_val, to_type, expected): @pytest.mark.notimpl( [ - "dask", "datafusion", "druid", "exasol", @@ -1845,7 +1817,6 @@ def test_try_cast_null(con, from_val, to_type): @pytest.mark.notimpl( [ "pandas", - "dask", "datafusion", "druid", "mysql", @@ -1872,7 +1843,6 @@ def test_try_cast_table(backend, con): @pytest.mark.notimpl( [ "pandas", - "dask", "datafusion", "mysql", "oracle", @@ -2335,14 +2305,6 @@ def test_subsequent_overlapping_order_by(con, backend, alltypes, df): "Query could not be planned. SQL query requires ordering a table by time column" ), ) -@pytest.mark.never( - ["dask"], - raises=(AssertionError, NotImplementedError), - reason=( - "dask doesn't support deterministic .sort_values(); " - "for older dask versions sorting by multiple columns is not supported" - ), -) def test_select_sort_sort(backend, alltypes, df): t = alltypes expr = t.order_by(t.year, t.id.desc()).order_by(t.bool_col) @@ -2370,15 +2332,6 @@ def test_select_sort_sort(backend, alltypes, df): "Query could not be planned. SQL query requires ordering a table by time column" ), ) -@pytest.mark.never( - ["dask"], - raises=(AssertionError, NotImplementedError), - reason=( - "dask doesn't support deterministic .sort_values(); " - "for older dask versions sorting by multiple columns is not supported" - ), - strict=False, -) def test_select_sort_sort_deferred(backend, alltypes, df): t = alltypes @@ -2414,9 +2367,7 @@ def test_select_sort_sort_deferred(backend, alltypes, df): backend.assert_frame_equal(result, expected) -@pytest.mark.notimpl( - ["pandas", "dask"], raises=IndexError, reason="NaN isn't treated as NULL" -) +@pytest.mark.notimpl(["pandas"], raises=IndexError, reason="NaN isn't treated as NULL") @pytest.mark.notimpl( ["druid"], raises=AttributeError, @@ -2436,9 +2387,9 @@ def test_topk_counts_null(con): reason="ClickHouse returns False for x.isin([None])", ) @pytest.mark.notimpl( - ["pandas", "dask"], + ["pandas"], raises=AssertionError, - reason="null isin semantics are not implemented for pandas or dask", + reason="null isin semantics are not implemented for pandas", ) @pytest.mark.never( "mssql", @@ -2452,10 +2403,6 @@ def test_null_isin_null_is_null(con): def test_value_counts_on_tables(backend, df): - if backend.name() == "dask": - pytest.skip(reason="flaky errors about sorting on multi-partition dataframes") - from ibis import selectors as s - t = backend.functional_alltypes expr = t[["bigint_col", "int_col"]].value_counts().order_by(s.all()) result = expr.execute() diff --git a/ibis/backends/tests/test_interactive.py b/ibis/backends/tests/test_interactive.py index 276377a249d9e..388c63ab22b3f 100644 --- a/ibis/backends/tests/test_interactive.py +++ b/ibis/backends/tests/test_interactive.py @@ -33,7 +33,7 @@ def table(backend): return backend.functional_alltypes -@pytest.mark.notimpl(["dask", "pandas", "polars"]) +@pytest.mark.notimpl(["pandas", "polars"]) def test_interactive_execute_on_repr(table, queries): repr(table.bigint_col.sum()) assert len(queries) >= 1 @@ -54,21 +54,21 @@ def test_repr_png_is_not_none_in_not_interactive(table): assert 
table._repr_png_() is not None -@pytest.mark.notimpl(["dask", "pandas", "polars"]) +@pytest.mark.notimpl(["pandas", "polars"]) def test_default_limit(table, queries): repr(table.select("id", "bool_col")) assert len(queries) >= 1 -@pytest.mark.notimpl(["dask", "pandas", "polars"]) +@pytest.mark.notimpl(["pandas", "polars"]) def test_respect_set_limit(table, queries): repr(table.select("id", "bool_col").limit(10)) assert len(queries) >= 1 -@pytest.mark.notimpl(["dask", "pandas", "polars"]) +@pytest.mark.notimpl(["pandas", "polars"]) def test_disable_query_limit(table, queries): assert ibis.options.sql.default_limit is None diff --git a/ibis/backends/tests/test_join.py b/ibis/backends/tests/test_join.py index 13f05bfe61864..9be29ce0ec607 100644 --- a/ibis/backends/tests/test_join.py +++ b/ibis/backends/tests/test_join.py @@ -122,7 +122,7 @@ def test_mutating_join(backend, batting, awards_players, how): @pytest.mark.parametrize("how", ["semi", "anti"]) -@pytest.mark.notimpl(["dask", "druid"]) +@pytest.mark.notimpl(["druid"]) @pytest.mark.notyet(["flink"], reason="Flink doesn't support semi joins or anti joins") def test_filtering_join(backend, batting, awards_players, how): left = batting[batting.yearID == 2015] @@ -173,7 +173,6 @@ def test_mutate_then_join_no_column_overlap(batting, awards_players): @pytest.mark.notimpl(["druid"]) -@pytest.mark.notyet(["dask"], reason="dask doesn't support descending order by") @pytest.mark.notyet(["flink"], reason="Flink doesn't support semi joins") @pytest.mark.skip("risingwave") # TODO(Kexiang): RisingWave's bug, investigating @pytest.mark.parametrize( diff --git a/ibis/backends/tests/test_json.py b/ibis/backends/tests/test_json.py index 8d2b3e86e9af4..f4dc1f572fda6 100644 --- a/ibis/backends/tests/test_json.py +++ b/ibis/backends/tests/test_json.py @@ -62,7 +62,7 @@ def test_json_getitem_array(json_t): assert result == expected -@pytest.mark.notimpl(["dask", "mysql", "pandas", "risingwave"]) +@pytest.mark.notimpl(["mysql", "pandas", "risingwave"]) @pytest.mark.notyet(["bigquery", "sqlite"], reason="doesn't support maps") @pytest.mark.notyet(["postgres"], reason="only supports map") @pytest.mark.notyet( @@ -84,7 +84,7 @@ def test_json_map(backend, json_t): backend.assert_series_equal(result, expected) -@pytest.mark.notimpl(["dask", "mysql", "pandas", "risingwave"]) +@pytest.mark.notimpl(["mysql", "pandas", "risingwave"]) @pytest.mark.notyet(["sqlite"], reason="doesn't support arrays") @pytest.mark.notyet( ["pyspark", "flink"], reason="should work but doesn't deserialize JSON" @@ -106,7 +106,7 @@ def test_json_array(backend, json_t): condition=vparse(sqlite3.sqlite_version) < vparse("3.38.0"), reason="JSON not supported in SQLite < 3.38.0", ) -@pytest.mark.notimpl(["dask", "pandas", "risingwave"]) +@pytest.mark.notimpl(["pandas", "risingwave"]) @pytest.mark.notyet(["flink"], reason="should work but doesn't deserialize JSON") @pytest.mark.parametrize( ("typ", "expected_data"), diff --git a/ibis/backends/tests/test_map.py b/ibis/backends/tests/test_map.py index 2f4473830d35d..18321f7c797e7 100644 --- a/ibis/backends/tests/test_map.py +++ b/ibis/backends/tests/test_map.py @@ -41,7 +41,7 @@ @pytest.mark.notyet("clickhouse", reason="nested types can't be NULL") -@pytest.mark.notimpl(["pandas", "dask"], reason="TypeError: iteration over a 0-d array") +@pytest.mark.notimpl(["pandas"], reason="TypeError: iteration over a 0-d array") @pytest.mark.notimpl( ["risingwave"], raises=PsycoPg2InternalError, @@ -63,7 +63,7 @@ def test_map_nulls(con, k, v): 
@pytest.mark.notyet("clickhouse", reason="nested types can't be NULL") -@pytest.mark.notimpl(["pandas", "dask"], reason="TypeError: iteration over a 0-d array") +@pytest.mark.notimpl(["pandas"], reason="TypeError: iteration over a 0-d array") @pytest.mark.notimpl( ["risingwave"], raises=PsycoPg2InternalError, @@ -98,7 +98,7 @@ def test_map_keys_nulls(con, k, v): ), marks=[ pytest.mark.notimpl( - ["pandas", "dask"], reason="TypeError: iteration over a 0-d array" + ["pandas"], reason="TypeError: iteration over a 0-d array" ) ], id="null_values", @@ -110,7 +110,7 @@ def test_map_keys_nulls(con, k, v): ), marks=[ pytest.mark.notimpl( - ["pandas", "dask"], reason="TypeError: iteration over a 0-d array" + ["pandas"], reason="TypeError: iteration over a 0-d array" ) ], id="null_both", @@ -137,7 +137,7 @@ def test_map_values_nulls(con, map): ibis.literal(None, type="string"), marks=[ pytest.mark.notimpl( - ["pandas", "dask"], + ["pandas"], reason="result is False instead of None", strict=False, # passes for contains, but not for get ), @@ -159,7 +159,7 @@ def test_map_values_nulls(con, map): marks=[ pytest.mark.notyet("clickhouse", reason="nested types can't be NULL"), pytest.mark.notimpl( - ["pandas", "dask"], reason="TypeError: iteration over a 0-d array" + ["pandas"], reason="TypeError: iteration over a 0-d array" ), ], id="null_both_non_null_key", @@ -173,7 +173,7 @@ def test_map_values_nulls(con, map): marks=[ pytest.mark.notyet("clickhouse", reason="nested types can't be NULL"), pytest.mark.notimpl( - ["pandas", "dask"], reason="TypeError: iteration over a 0-d array" + ["pandas"], reason="TypeError: iteration over a 0-d array" ), ], id="null_both_null_key", @@ -233,14 +233,14 @@ def test_map_merge_nulls(con, m1, m2): assert con.execute(concatted) is None -@pytest.mark.notimpl(["pandas", "dask"]) +@pytest.mark.notimpl(["pandas"]) def test_map_table(backend): table = backend.map assert table.kv.type().is_map() assert not table.limit(1).execute().empty -@pytest.mark.notimpl(["pandas", "dask"]) +@pytest.mark.notimpl(["pandas"]) @pytest.mark.xfail_version( duckdb=["duckdb<0.8.0"], raises=exc.UnsupportedOperationError ) @@ -253,7 +253,7 @@ def test_column_map_values(backend): backend.assert_series_equal(result, expected) -@pytest.mark.notimpl(["pandas", "dask"]) +@pytest.mark.notimpl(["pandas"]) @pytest.mark.xfail_version( duckdb=["duckdb<0.8.0"], raises=exc.UnsupportedOperationError ) @@ -408,9 +408,7 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): pytest.mark.notyet( "clickhouse", reason="only supports str,int,bool,timestamp keys" ), - pytest.mark.notimpl( - ["pandas", "dask"], reason="DateFromYMD isn't implemented" - ), + pytest.mark.notimpl(["pandas"], reason="DateFromYMD isn't implemented"), mark_notyet_postgres, mark_notyet_snowflake, ], @@ -422,7 +420,7 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): pytest.mark.notyet( "clickhouse", reason="only supports str,int,bool,timestamp keys" ), - pytest.mark.notyet(["pandas", "dask"]), + pytest.mark.notyet(["pandas"]), mark_notyet_postgres, mark_notyet_snowflake, ], @@ -434,7 +432,7 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): pytest.mark.notyet( "clickhouse", reason="only supports str,int,bool,timestamp keys" ), - pytest.mark.notyet(["pandas", "dask"]), + pytest.mark.notyet(["pandas"]), mark_notyet_postgres, pytest.mark.notyet( ["flink"], @@ -484,9 +482,7 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): pytest.param( [ibis.date(2021, 1, 1), ibis.date(2022, 2, 2)], marks=[ - 
pytest.mark.notimpl( - ["pandas", "dask"], reason="DateFromYMD isn't implemented" - ), + pytest.mark.notimpl(["pandas"], reason="DateFromYMD isn't implemented"), mark_notyet_postgres, ], id="date", diff --git a/ibis/backends/tests/test_network.py b/ibis/backends/tests/test_network.py index 0947ecf6f4d20..33e1b2c997eb9 100644 --- a/ibis/backends/tests/test_network.py +++ b/ibis/backends/tests/test_network.py @@ -55,7 +55,6 @@ def test_macaddr_literal(con, backend): "pandas": "127.0.0.1", "pyspark": "127.0.0.1", "mysql": "127.0.0.1", - "dask": "127.0.0.1", "mssql": "127.0.0.1", "datafusion": "127.0.0.1", "flink": "127.0.0.1", @@ -89,7 +88,6 @@ def test_macaddr_literal(con, backend): "pandas": "2001:db8::1", "pyspark": "2001:db8::1", "mysql": "2001:db8::1", - "dask": "2001:db8::1", "mssql": "2001:db8::1", "datafusion": "2001:db8::1", "flink": "2001:db8::1", diff --git a/ibis/backends/tests/test_numeric.py b/ibis/backends/tests/test_numeric.py index 12b8a8a580966..75dd5e49106d8 100644 --- a/ibis/backends/tests/test_numeric.py +++ b/ibis/backends/tests/test_numeric.py @@ -245,7 +245,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "snowflake": decimal.Decimal("1.1"), "sqlite": decimal.Decimal("1.1"), "trino": decimal.Decimal("1.1"), - "dask": decimal.Decimal("1.1"), "exasol": decimal.Decimal("1"), "duckdb": decimal.Decimal("1.1"), "impala": decimal.Decimal("1"), @@ -298,7 +297,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "pyspark": decimal.Decimal("1.1"), "mysql": decimal.Decimal("1.1"), "clickhouse": decimal.Decimal("1.1"), - "dask": decimal.Decimal("1.1"), "mssql": decimal.Decimal("1.1"), "druid": decimal.Decimal("1.1"), "datafusion": decimal.Decimal("1.1"), @@ -327,7 +325,6 @@ def test_numeric_literal(con, backend, expr, expected_types): { "bigquery": decimal.Decimal("1.1"), "sqlite": decimal.Decimal("1.1"), - "dask": decimal.Decimal("1.1"), "postgres": decimal.Decimal("1.1"), "risingwave": decimal.Decimal("1.1"), "pandas": decimal.Decimal("1.1"), @@ -386,7 +383,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "postgres": decimal.Decimal("Infinity"), "risingwave": decimal.Decimal("Infinity"), "pandas": decimal.Decimal("Infinity"), - "dask": decimal.Decimal("Infinity"), "pyspark": decimal.Decimal("Infinity"), "exasol": float("inf"), "duckdb": float("inf"), @@ -451,7 +447,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "postgres": decimal.Decimal("-Infinity"), "risingwave": decimal.Decimal("-Infinity"), "pandas": decimal.Decimal("-Infinity"), - "dask": decimal.Decimal("-Infinity"), "pyspark": decimal.Decimal("-Infinity"), "exasol": float("-inf"), "duckdb": float("-inf"), @@ -517,7 +512,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "postgres": float("nan"), "risingwave": float("nan"), "pandas": decimal.Decimal("NaN"), - "dask": decimal.Decimal("NaN"), "pyspark": decimal.Decimal("NaN"), "exasol": float("nan"), "duckdb": float("nan"), @@ -1307,7 +1301,7 @@ def test_divide_by_zero(backend, alltypes, df, column, denominator): backend.assert_series_equal(result.astype("float64"), expected) -@pytest.mark.notimpl(["dask", "pandas", "polars"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["pandas", "polars"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl(["druid"], raises=PyDruidProgrammingError) @pytest.mark.notimpl( ["risingwave"], diff --git a/ibis/backends/tests/test_register.py b/ibis/backends/tests/test_register.py index cdfa1683743f2..05c2f9f5b9360 100644 --- 
a/ibis/backends/tests/test_register.py +++ b/ibis/backends/tests/test_register.py @@ -85,7 +85,6 @@ def gzip_csv(data_dir, tmp_path): [ "bigquery", "clickhouse", - "dask", "flink", "impala", "mssql", @@ -114,7 +113,6 @@ def test_register_csv(con, data_dir, fname, in_table_name, out_table_name): [ "bigquery", "clickhouse", - "dask", "flink", "impala", "mssql", @@ -140,7 +138,6 @@ def test_register_csv_gz(con, data_dir, gzip_csv): [ "bigquery", "clickhouse", - "dask", "flink", "impala", "mssql", @@ -199,7 +196,6 @@ def read_table(path: Path) -> Iterator[tuple[str, pa.Table]]: [ "bigquery", "clickhouse", - "dask", "flink", "impala", "mssql", @@ -237,7 +233,6 @@ def test_register_parquet( [ "bigquery", "clickhouse", - "dask", "datafusion", "flink", "impala", @@ -285,7 +280,6 @@ def test_register_iterator_parquet( [ "bigquery", "clickhouse", - "dask", "flink", "impala", "mssql", @@ -321,7 +315,6 @@ def test_register_pandas(con): [ "bigquery", "clickhouse", - "dask", "flink", "impala", "mssql", @@ -348,7 +341,6 @@ def test_register_pyarrow_tables(con): [ "bigquery", "clickhouse", - "dask", "flink", "impala", "mssql", @@ -388,7 +380,6 @@ def test_csv_reregister_schema(con, tmp_path): [ "bigquery", "clickhouse", - "dask", "datafusion", "flink", "impala", @@ -517,7 +508,6 @@ def test_read_csv_glob(con, tmp_path, ft_data): @pytest.mark.notyet( [ "clickhouse", - "dask", "datafusion", "impala", "mssql", diff --git a/ibis/backends/tests/test_set_ops.py b/ibis/backends/tests/test_set_ops.py index adb08a93c34f8..d86bf0d964f29 100644 --- a/ibis/backends/tests/test_set_ops.py +++ b/ibis/backends/tests/test_set_ops.py @@ -72,7 +72,6 @@ def test_union_mixed_distinct(backend, union_subsets): [ "impala", "bigquery", - "dask", "pandas", "sqlite", "snowflake", @@ -126,7 +125,6 @@ def test_intersect(backend, alltypes, df, distinct): [ "impala", "bigquery", - "dask", "pandas", "sqlite", "snowflake", @@ -227,7 +225,6 @@ def test_top_level_union(backend, con, alltypes, distinct, ordered): [ "impala", "bigquery", - "dask", "mssql", "pandas", "snowflake", diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index 4a299f8a83c12..b5cf436e26e5e 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -42,12 +42,12 @@ ), ], ) -@pytest.mark.never(["pandas", "dask", "polars"], reason="not SQL", raises=ValueError) +@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError) def test_literal(backend, expr): assert "432" in ibis.to_sql(expr, dialect=backend.name()) -@pytest.mark.never(["pandas", "dask", "polars"], reason="not SQL", raises=ValueError) +@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError) def test_group_by_has_index(backend, snapshot): countries = ibis.table( dict(continent="string", population="int64"), name="countries" @@ -70,7 +70,7 @@ def test_group_by_has_index(backend, snapshot): snapshot.assert_match(sql, "out.sql") -@pytest.mark.never(["pandas", "dask", "polars"], reason="not SQL", raises=ValueError) +@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError) def test_cte_refs_in_topo_order(backend, snapshot): mr0 = ibis.table(schema=ibis.schema(dict(key="int")), name="leaf") @@ -83,7 +83,7 @@ def test_cte_refs_in_topo_order(backend, snapshot): snapshot.assert_match(sql, "out.sql") -@pytest.mark.never(["pandas", "dask", "polars"], reason="not SQL", raises=ValueError) +@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError) def test_isin_bug(con, snapshot): t = 
ibis.table(dict(x="int"), name="t") good = t[t.x > 2].x @@ -91,7 +91,7 @@ def test_isin_bug(con, snapshot): snapshot.assert_match(str(ibis.to_sql(expr, dialect=con.name)), "out.sql") -@pytest.mark.never(["pandas", "dask", "polars"], reason="not SQL", raises=ValueError) +@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError) @pytest.mark.notyet( ["exasol", "oracle", "flink"], reason="no unnest support", @@ -156,7 +156,19 @@ def test_union_aliasing(backend_name, snapshot): snapshot.assert_match(str(ibis.to_sql(result, dialect=backend_name)), "out.sql") -@pytest.mark.never(["pandas", "dask", "polars"], reason="not SQL", raises=ValueError) +@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError) +def test_union_generates_predictable_aliases(con): + t = ibis.memtable( + data=[{"island": "Torgerson", "body_mass_g": 3750, "sex": "male"}] + ) + sub1 = t.inner_join(t.view(), "island").mutate(island_right=lambda t: t.island) + sub2 = t.inner_join(t.view(), "sex").mutate(sex_right=lambda t: t.sex) + expr = ibis.union(sub1, sub2) + df = con.execute(expr) + assert len(df) == 2 + + +@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=NotImplementedError) @pytest.mark.parametrize( "value", [ @@ -180,9 +192,7 @@ def test_selects_with_impure_operations_not_merged(con, snapshot, value): snapshot.assert_match(sql, "out.sql") -@pytest.mark.never( - ["pandas", "dask", "polars"], reason="not SQL", raises=NotImplementedError -) +@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=NotImplementedError) def test_to_sql_default_backend(con, snapshot, monkeypatch): monkeypatch.setattr(ibis.options, "default_backend", con) @@ -192,7 +202,7 @@ def test_to_sql_default_backend(con, snapshot, monkeypatch): @pytest.mark.notimpl( - ["dask", "pandas", "polars"], raises=ValueError, reason="not a SQL backend" + ["pandas", "polars"], raises=ValueError, reason="not a SQL backend" ) def test_many_subqueries(backend_name, snapshot): def query(t, group_cols): diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py index 7df32ed50b419..6fd0b9d12943e 100644 --- a/ibis/backends/tests/test_string.py +++ b/ibis/backends/tests/test_string.py @@ -760,7 +760,7 @@ def test_substr_with_null_values(backend, alltypes, df): id="file", marks=[ pytest.mark.notimpl( - ["pandas", "dask", "datafusion", "sqlite"], + ["pandas", "datafusion", "sqlite"], raises=com.OperationNotDefinedError, ), ], @@ -837,7 +837,6 @@ def test_capitalize(con, inp, expected): @pytest.mark.notimpl( [ - "dask", "pandas", "polars", "oracle", @@ -879,7 +878,6 @@ def test_multiple_subs(con): @pytest.mark.notimpl( [ "clickhouse", - "dask", "druid", "impala", "mssql", @@ -928,7 +926,6 @@ def test_non_match_regex_search_is_false(con): @pytest.mark.notimpl( [ - "dask", "impala", "mysql", "sqlite", @@ -951,7 +948,6 @@ def test_re_split(con): @pytest.mark.notimpl( [ - "dask", "impala", "mysql", "sqlite", @@ -973,7 +969,6 @@ def test_re_split_column(alltypes): @pytest.mark.notimpl( [ - "dask", "impala", "mysql", "sqlite", @@ -1021,7 +1016,7 @@ def test_re_split_column_multiple_patterns(alltypes): [lambda n: n + "a", lambda n: n + n, lambda n: "a" + n], ids=["null-a", "null-null", "a-null"], ) -@pytest.mark.notimpl(["pandas", "dask"], raises=TypeError) +@pytest.mark.notimpl(["pandas"], raises=TypeError) def test_concat_with_null(con, fn): null = ibis.literal(None, type="string") expr = fn(null) @@ -1043,7 +1038,7 @@ def test_concat_with_null(con, fn): [lambda args: 
args[0].concat(*args[1:]), lambda args: reduce(add, args)], ids=["concat", "add"], ) -@pytest.mark.notimpl(["pandas", "dask"], raises=TypeError) +@pytest.mark.notimpl(["pandas"], raises=TypeError) def test_concat(con, args, method): expr = method(args) assert pd.isna(con.execute(expr)) diff --git a/ibis/backends/tests/test_struct.py b/ibis/backends/tests/test_struct.py index e1ba89162b2a0..5880e20cde025 100644 --- a/ibis/backends/tests/test_struct.py +++ b/ibis/backends/tests/test_struct.py @@ -28,7 +28,6 @@ ] -@pytest.mark.notimpl(["dask"]) @pytest.mark.parametrize( ("field", "expected"), [ @@ -55,7 +54,6 @@ def test_single_field(struct, field, expected): tm.assert_series_equal(result.field, pd.Series(expected, name="field")) -@pytest.mark.notimpl(["dask"]) def test_all_fields(struct, struct_df): result = struct.abc.execute() expected = struct_df.abc @@ -251,7 +249,7 @@ def test_keyword_fields(con, nullable): ) @pytest.mark.notimpl( # https://github.com/pandas-dev/pandas/issues/58909 - ["pandas", "dask"], + ["pandas"], raises=TypeError, reason="unhashable type: 'dict'", ) diff --git a/ibis/backends/tests/test_temporal.py b/ibis/backends/tests/test_temporal.py index 56c76125ea805..cc0b06fe6c353 100644 --- a/ibis/backends/tests/test_temporal.py +++ b/ibis/backends/tests/test_temporal.py @@ -232,7 +232,7 @@ def test_timestamp_extract_milliseconds(backend, alltypes, df): reason="UNIX_SECONDS does not support DATETIME arguments", ) @pytest.mark.notimpl( - ["dask", "pandas"], + ["pandas"], raises=AssertionError, condition=is_older_than("pandas", "2.0.0"), ) @@ -466,11 +466,6 @@ def test_date_truncate(backend, alltypes, df, unit): raises=TypeError, reason="duration() got an unexpected keyword argument 'years'", ), - pytest.mark.notimpl( - ["dask"], - raises=ValueError, - reason="Metadata inference failed in `add`.", - ), pytest.mark.notyet( ["trino"], raises=com.UnsupportedOperationError, @@ -484,11 +479,6 @@ def test_date_truncate(backend, alltypes, df, unit): pd.offsets.DateOffset, # TODO - DateOffset - #2553 marks=[ - pytest.mark.notimpl( - ["dask"], - raises=ValueError, - reason="Metadata inference failed in `add`.", - ), pytest.mark.notimpl( ["polars"], raises=TypeError, @@ -506,11 +496,6 @@ def test_date_truncate(backend, alltypes, df, unit): pd.offsets.DateOffset, # TODO - DateOffset - #2553 marks=[ - pytest.mark.notimpl( - ["dask"], - raises=ValueError, - reason="Metadata inference failed in `add`.", - ), pytest.mark.notyet( ["trino"], raises=com.UnsupportedOperationError, @@ -957,11 +942,6 @@ def test_timestamp_comparison_filter(backend, con, alltypes, df, func_name): no_mixed_timestamp_comparisons = [ - pytest.mark.notimpl( - ["dask"], - raises=ValueError, - reason="Metadata inference failed in `gt`.", - ), pytest.mark.notimpl( ["pandas"], raises=TypeError, @@ -1238,15 +1218,7 @@ def test_integer_to_timestamp(backend, con, unit): ], ) @pytest.mark.notimpl( - [ - "dask", - "pandas", - "clickhouse", - "sqlite", - "datafusion", - "mssql", - "druid", - ], + ["pandas", "clickhouse", "sqlite", "datafusion", "mssql", "druid"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl(["exasol"], raises=com.OperationNotDefinedError) @@ -1317,15 +1289,7 @@ def test_string_to_timestamp(alltypes, fmt): ], ) @pytest.mark.notimpl( - [ - "dask", - "pandas", - "clickhouse", - "sqlite", - "datafusion", - "mssql", - "druid", - ], + ["pandas", "clickhouse", "sqlite", "datafusion", "mssql", "druid"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl(["exasol"], 
raises=com.OperationNotDefinedError) @@ -1465,7 +1429,7 @@ def test_today_from_projection(alltypes): @pytest.mark.notimpl( - ["pandas", "dask", "exasol", "risingwave", "druid"], + ["pandas", "exasol", "risingwave", "druid"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl( @@ -1496,7 +1460,7 @@ def test_date_literal(con, backend): @pytest.mark.notimpl( - ["pandas", "dask", "pyspark", "mysql", "exasol", "oracle"], + ["pandas", "pyspark", "mysql", "exasol", "oracle"], raises=com.OperationNotDefinedError, ) @pytest.mark.notyet(["impala"], raises=com.OperationNotDefinedError) @@ -1513,7 +1477,7 @@ def test_timestamp_literal(con, backend): @pytest.mark.notimpl( - ["pandas", "mysql", "dask", "pyspark", "exasol"], + ["pandas", "mysql", "pyspark", "exasol"], raises=com.OperationNotDefinedError, ) @pytest.mark.notyet(["impala", "oracle"], raises=com.OperationNotDefinedError) @@ -1574,7 +1538,7 @@ def test_timestamp_with_timezone_literal(con, timezone, expected): @pytest.mark.notimpl( - ["pandas", "datafusion", "dask", "pyspark", "polars", "mysql", "oracle"], + ["pandas", "datafusion", "pyspark", "polars", "mysql", "oracle"], raises=com.OperationNotDefinedError, ) @pytest.mark.notyet( @@ -1700,8 +1664,7 @@ def test_interval_literal(con, backend): @pytest.mark.notimpl( - ["pandas", "dask", "exasol", "risingwave", "druid"], - raises=com.OperationNotDefinedError, + ["pandas", "exasol", "risingwave", "druid"], raises=com.OperationNotDefinedError ) @pytest.mark.notimpl( ["oracle"], raises=OracleDatabaseError, reason="ORA-00936: missing expression" @@ -1717,8 +1680,7 @@ def test_date_column_from_ymd(backend, con, alltypes, df): @pytest.mark.notimpl( - ["pandas", "dask", "pyspark", "mysql", "exasol"], - raises=com.OperationNotDefinedError, + ["pandas", "pyspark", "mysql", "exasol"], raises=com.OperationNotDefinedError ) @pytest.mark.notyet(["impala", "oracle"], raises=com.OperationNotDefinedError) def test_timestamp_column_from_ymdhms(backend, con, alltypes, df): @@ -1975,15 +1937,7 @@ def test_timestamp_precision_output(con, ts, scale, unit): @pytest.mark.notimpl( - [ - "dask", - "datafusion", - "druid", - "impala", - "oracle", - "pandas", - "polars", - ], + ["datafusion", "druid", "impala", "oracle", "pandas", "polars"], raises=com.OperationNotDefinedError, ) @pytest.mark.parametrize( @@ -2039,7 +1993,7 @@ def test_delta(con, start, end, unit, expected): @pytest.mark.notimpl( - ["dask", "impala", "mysql", "pandas", "pyspark", "sqlite", "trino", "druid"], + ["impala", "mysql", "pandas", "pyspark", "sqlite", "trino", "druid"], raises=com.OperationNotDefinedError, ) @pytest.mark.parametrize( @@ -2142,7 +2096,6 @@ def test_timestamp_bucket(backend, kws, pd_freq): @pytest.mark.notimpl( [ - "dask", "datafusion", "impala", "mysql", @@ -2178,7 +2131,7 @@ def test_timestamp_bucket_offset(backend, offset_mins): backend.assert_series_equal(res, sol) -_NO_SQLGLOT_DIALECT = ("pandas", "dask", "flink", "polars") +_NO_SQLGLOT_DIALECT = ("pandas", "flink", "polars") no_sqlglot_dialect = sorted( param(backend, marks=pytest.mark.xfail) for backend in _NO_SQLGLOT_DIALECT ) @@ -2248,7 +2201,7 @@ def test_time_literal_sql(dialect, snapshot, micros): ), pytest.mark.notyet(["datafusion"], raises=Exception), pytest.mark.notyet( - ["pandas", "dask"], + ["pandas"], condition=is_older_than("pandas", "2.0.0"), raises=ValueError, reason="Out of bounds nanosecond timestamp: 9999-01-02 00:00:00", @@ -2267,7 +2220,7 @@ def test_time_literal_sql(dialect, snapshot, micros): ), pytest.mark.notyet(["datafusion"], 
raises=Exception), pytest.mark.notyet( - ["pandas", "dask"], + ["pandas"], condition=is_older_than("pandas", "2.0.0"), raises=ValueError, reason="Out of bounds nanosecond timestamp: 1-07-17 00:00:00", diff --git a/ibis/backends/tests/test_udf.py b/ibis/backends/tests/test_udf.py index 4fc2e8898cfe1..ffdc6ca2437e3 100644 --- a/ibis/backends/tests/test_udf.py +++ b/ibis/backends/tests/test_udf.py @@ -12,7 +12,6 @@ [ "bigquery", "clickhouse", - "dask", "druid", "exasol", "impala", diff --git a/ibis/backends/tests/test_uuid.py b/ibis/backends/tests/test_uuid.py index 8e1798a95fad9..8768b0e137e01 100644 --- a/ibis/backends/tests/test_uuid.py +++ b/ibis/backends/tests/test_uuid.py @@ -42,7 +42,7 @@ def test_uuid_literal(con, backend): @pytest.mark.notimpl( - ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave", "pandas", "dask"], + ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave", "pandas"], raises=com.OperationNotDefinedError, ) @pytest.mark.never( @@ -55,7 +55,7 @@ def test_uuid_function(con): @pytest.mark.notimpl( - ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave", "pandas", "dask"], + ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave", "pandas"], raises=com.OperationNotDefinedError, ) def test_uuid_unique_each_row(con): diff --git a/ibis/backends/tests/test_vectorized_udf.py b/ibis/backends/tests/test_vectorized_udf.py index 1973b1c10439d..a893df9ab18fb 100644 --- a/ibis/backends/tests/test_vectorized_udf.py +++ b/ibis/backends/tests/test_vectorized_udf.py @@ -55,9 +55,7 @@ def add_one_udf(s: float) -> float: yield param(add_one_legacy, id=f"add_one_legacy_{id}") yield param( - add_one_udf, - marks=[pytest.mark.notimpl(["pandas", "dask"])], - id=f"add_one_modern_{id}", + add_one_udf, marks=[pytest.mark.notimpl(["pandas"])], id=f"add_one_modern_{id}" ) @@ -620,7 +618,6 @@ def test_elementwise_udf_struct(udf_backend, udf_alltypes): @pytest.mark.parametrize("udf", demean_struct_udfs) @pytest.mark.notimpl(["pyspark"]) -@pytest.mark.notimpl(["dask"], strict=False) def test_analytic_udf_destruct(udf_backend, udf_alltypes, udf): w = ibis.window(preceding=None, following=None, group_by="year") diff --git a/ibis/backends/tests/test_window.py b/ibis/backends/tests/test_window.py index 80756473f9ba7..b811f386ef873 100644 --- a/ibis/backends/tests/test_window.py +++ b/ibis/backends/tests/test_window.py @@ -153,8 +153,7 @@ def calc_zscore(s): id="ntile", marks=[ pytest.mark.notimpl( - ["dask", "pandas", "polars"], - raises=com.OperationNotDefinedError, + ["pandas", "polars"], raises=com.OperationNotDefinedError ), pytest.mark.notimpl( ["impala"], @@ -199,7 +198,6 @@ def calc_zscore(s): pytest.mark.notyet( ["impala", "mssql"], raises=com.OperationNotDefinedError ), - pytest.mark.notimpl(["dask"], raises=com.OperationNotDefinedError), pytest.mark.notimpl(["flink"], raises=com.OperationNotDefinedError), pytest.mark.notimpl(["risingwave"], raises=PsycoPg2InternalError), ], @@ -599,7 +597,6 @@ def test_grouped_unbounded_window( ], ) @pytest.mark.notimpl(["snowflake"], raises=AssertionError) -@pytest.mark.notimpl(["dask"], raises=AssertionError) @pytest.mark.notyet(["mssql"], raises=PyODBCProgrammingError) @pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl( @@ -677,9 +674,7 @@ def test_simple_ungrouped_window_with_scalar_order_by(alltypes): True, id="unordered-ntile", marks=[ - pytest.mark.notimpl( - ["pandas", "dask"], raises=com.OperationNotDefinedError - ), + pytest.mark.notimpl(["pandas"], 
raises=com.OperationNotDefinedError), pytest.mark.notimpl( ["risingwave"], raises=PsycoPg2InternalError, diff --git a/ibis/config.py b/ibis/config.py index ca6140afec264..ff1d1770160ff 100644 --- a/ibis/config.py +++ b/ibis/config.py @@ -148,8 +148,6 @@ class Options(Config): SQL-related options. clickhouse : Config | None Clickhouse specific options. - dask : Config | None - Dask specific options. impala : Config | None Impala specific options. pandas : Config | None @@ -167,7 +165,6 @@ class Options(Config): default_backend: Optional[Any] = None sql: SQL = SQL() clickhouse: Optional[Config] = None - dask: Optional[Config] = None impala: Optional[Config] = None pandas: Optional[Config] = None pyspark: Optional[Config] = None diff --git a/ibis/expr/tests/test_schema.py b/ibis/expr/tests/test_schema.py index 3bd6f059e83bf..4cd176d67f58a 100644 --- a/ibis/expr/tests/test_schema.py +++ b/ibis/expr/tests/test_schema.py @@ -452,17 +452,8 @@ def test_schema_from_to_numpy_dtypes(): assert restored_dtypes == expected_dtypes -@pytest.mark.parametrize( - ("from_method", "to_method"), - [ - pytest.param( - "from_pandas", - "to_pandas", - marks=pytest.mark.skipif(not has_pandas, reason="pandas not installed"), - ), - ], -) -def test_schema_from_to_pandas_dask_dtypes(from_method, to_method): +def test_schema_from_to_pandas_dtypes(): + pd = pytest.importorskip("pandas") pandas_schema = pd.Series( [ ("a", np.dtype("int64")), @@ -471,7 +462,7 @@ def test_schema_from_to_pandas_dask_dtypes(from_method, to_method): ("d", pd.DatetimeTZDtype(tz="US/Eastern", unit="ns")), ] ) - ibis_schema = getattr(sch.Schema, from_method)(pandas_schema) + ibis_schema = sch.Schema.from_pandas(pandas_schema) assert ibis_schema == sch.schema(pandas_schema) expected = sch.Schema( @@ -484,7 +475,7 @@ def test_schema_from_to_pandas_dask_dtypes(from_method, to_method): ) assert ibis_schema == expected - restored_dtypes = getattr(ibis_schema, to_method)() + restored_dtypes = ibis_schema.to_pandas() expected_dtypes = [ ("a", np.dtype("int64")), ("b", np.dtype("object")), diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py index 473b74fc5d4c9..5292dc672ff4f 100644 --- a/ibis/expr/types/generic.py +++ b/ibis/expr/types/generic.py @@ -1359,7 +1359,7 @@ def _repr_html_(self) -> str | None: @public class Column(Value, _FixedTextJupyterMixin): - # Higher than numpy & dask objects + # Higher than numpy objects __array_priority__ = 20 __array_ufunc__ = None diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index ae9511740cdff..72f9eccea1774 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -170,7 +170,7 @@ class Table(Expr, _FixedTextJupyterMixin): info. 
""" - # Higher than numpy & dask objects + # Higher than numpy objects __array_priority__ = 20 __array_ufunc__ = None diff --git a/ibis/tests/benchmarks/test_benchmarks.py b/ibis/tests/benchmarks/test_benchmarks.py index 58bad8be2f16b..949e2c661a95a 100644 --- a/ibis/tests/benchmarks/test_benchmarks.py +++ b/ibis/tests/benchmarks/test_benchmarks.py @@ -161,7 +161,7 @@ def test_builtins(benchmark, expr_fn, builtin, t, base, large_expr): _backends = _get_backend_names(exclude=("pandas",)) -_XFAIL_COMPILE_BACKENDS = ("dask", "polars") +_XFAIL_COMPILE_BACKENDS = ("polars",) @pytest.mark.benchmark(group="compilation") diff --git a/nix/ibis.nix b/nix/ibis.nix index 61ea5c4ad0a8e..7a507c35da80b 100644 --- a/nix/ibis.nix +++ b/nix/ibis.nix @@ -9,9 +9,7 @@ let # pyspark could be added here, but it doesn't handle parallel test execution # well and serially it takes on the order of 7-8 minutes to execute serially - backends = [ "datafusion" "duckdb" "pandas" "polars" "sqlite" ] - # dask version has a show-stopping bug for Python >=3.11 - ++ lib.optionals (python3.pythonOlder "3.11") [ "dask" ]; + backends = [ "datafusion" "duckdb" "pandas" "polars" "sqlite" ]; markers = lib.concatStringsSep " or " (backends ++ [ "core" ]); in poetry2nix.mkPoetryApplication rec { diff --git a/poetry.lock b/poetry.lock index 8d31459269408..fb341f0dd3abd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1306,37 +1306,6 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] -[[package]] -name = "dask" -version = "2024.2.1" -description = "Parallel PyData with Task Scheduling" -optional = true -python-versions = ">=3.9" -files = [ - {file = "dask-2024.2.1-py3-none-any.whl", hash = "sha256:a13fcdeead3bab3576495023f83097adcffe2f03c371c241b5a1f0b232b35b38"}, - {file = "dask-2024.2.1.tar.gz", hash = "sha256:9504a1e9f5d8e5403fae931f9f1660d41f510f48895ccefce856ec6a4c2198d8"}, -] - -[package.dependencies] -click = ">=8.1" -cloudpickle = ">=1.5.0" -fsspec = ">=2021.09.0" -importlib-metadata = ">=4.13.0" -numpy = {version = ">=1.21", optional = true, markers = "extra == \"array\""} -packaging = ">=20.0" -pandas = {version = ">=1.3", optional = true, markers = "extra == \"dataframe\""} -partd = ">=1.2.0" -pyyaml = ">=5.3.1" -toolz = ">=0.10.0" - -[package.extras] -array = ["numpy (>=1.21)"] -complete = ["dask[array,dataframe,diagnostics,distributed]", "lz4 (>=4.3.2)", "pyarrow (>=7.0)", "pyarrow-hotfix"] -dataframe = ["dask[array]", "pandas (>=1.3)"] -diagnostics = ["bokeh (>=2.4.2)", "jinja2 (>=2.10.3)"] -distributed = ["distributed (==2024.2.1)"] -test = ["pandas[test]", "pre-commit", "pytest", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist"] - [[package]] name = "datafusion" version = "39.0.0" @@ -3322,17 +3291,6 @@ files = [ {file = "kiwisolver-1.4.5.tar.gz", hash = "sha256:e57e563a57fb22a142da34f38acc2fc1a5c864bc29ca1517a88abc963e60d6ec"}, ] -[[package]] -name = "locket" -version = "1.0.0" -description = "File-based locks for Python on Linux and Windows" -optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "locket-1.0.0-py2.py3-none-any.whl", hash = "sha256:b6c819a722f7b6bd955b80781788e4a66a55628b858d347536b7e81325a3a5e3"}, - {file = "locket-1.0.0.tar.gz", hash = "sha256:5c0d4c052a8bbbf750e056a8e65ccd309086f4f0f18a2eac306a8dfa4112a632"}, -] - [[package]] name = "lonboard" version = "0.9.3" @@ -4176,24 +4134,6 @@ files = [ {file = "parsy-2.1.tar.gz", hash = 
"sha256:fd5dd18d7b0b61f8275ee88665f430a20c02cf5a82d88557f35330530186d7ac"}, ] -[[package]] -name = "partd" -version = "1.4.2" -description = "Appendable key-value storage" -optional = true -python-versions = ">=3.9" -files = [ - {file = "partd-1.4.2-py3-none-any.whl", hash = "sha256:978e4ac767ec4ba5b86c6eaa52e5a2a3bc748a2ca839e8cc798f1cc6ce6efb0f"}, - {file = "partd-1.4.2.tar.gz", hash = "sha256:d022c33afbdc8405c226621b015e8067888173d85f7f5ecebb3cafed9a20f02c"}, -] - -[package.dependencies] -locket = "*" -toolz = "*" - -[package.extras] -complete = ["blosc", "numpy (>=1.20.0)", "pandas (>=1.3)", "pyzmq"] - [[package]] name = "pathspec" version = "0.12.1" @@ -7742,7 +7682,6 @@ cffi = ["cffi (>=1.11)"] [extras] bigquery = ["db-dtypes", "google-cloud-bigquery", "google-cloud-bigquery-storage", "pyarrow", "pyarrow-hotfix", "pydata-google-auth"] clickhouse = ["clickhouse-connect", "pyarrow", "pyarrow-hotfix"] -dask = ["dask", "packaging", "pyarrow", "pyarrow-hotfix", "regex"] datafusion = ["datafusion", "pyarrow", "pyarrow-hotfix"] decompiler = ["black"] deltalake = ["deltalake"] @@ -7769,4 +7708,4 @@ visualization = ["graphviz"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "03717a8b1d7b3944ba9e69f6ce2d3b023f406674f7d1b5dca610d85754a4c6ba" +content-hash = "d4499f29930cbaff7e95eef26dcea83c8eb1dfefcc45402913fe0bd0af39827f" diff --git a/pyproject.toml b/pyproject.toml index 843b9bb8be31d..6b11705683fd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,10 +55,6 @@ clickhouse-connect = { version = ">=0.5.23,<1", optional = true, extras = [ "numpy", "pandas", ] } -dask = { version = ">=2022.9.1,<2024.3.0", optional = true, extras = [ - "array", - "dataframe", -] } datafusion = { version = ">=0.6,<40", optional = true } db-dtypes = { version = ">=0.3,<2", optional = true } deltalake = { version = ">=0.9.0,<1", optional = true } @@ -82,7 +78,7 @@ pyexasol = { version = ">=0.25.2,<1", optional = true, extras = ["pandas"] } pymysql = { version = ">=1,<2", optional = true } pyodbc = { version = ">=4.0.39,<6", optional = true } pyspark = { version = ">=3.3.3,<4", optional = true } -# used to support posix regexen in the pandas, dask and sqlite backends +# used to support posix regexen in the pandas and sqlite backends regex = { version = ">=2021.7.6", optional = true } shapely = { version = ">=2,<3", optional = true } # we don't support arbitrarily old versions of this library due to security @@ -152,7 +148,6 @@ bigquery = [ "pydata-google-auth", ] clickhouse = ["clickhouse-connect", "pyarrow", "pyarrow-hotfix"] -dask = ["dask", "regex", "packaging", "pyarrow", "pyarrow-hotfix"] datafusion = ["datafusion", "pyarrow", "pyarrow-hotfix"] druid = ["pydruid", "pyarrow", "pyarrow-hotfix"] duckdb = ["duckdb", "pyarrow", "pyarrow-hotfix"] @@ -180,7 +175,6 @@ geospatial = ["geoarrow-types", "geopandas", "pyproj", "shapely"] [tool.poetry.plugins."ibis.backends"] bigquery = "ibis.backends.bigquery" clickhouse = "ibis.backends.clickhouse" -dask = "ibis.backends.dask" datafusion = "ibis.backends.datafusion" druid = "ibis.backends.druid" duckdb = "ibis.backends.duckdb" @@ -238,19 +232,6 @@ filterwarnings = [ "ignore:is_datetime64tz_dtype is deprecated and will be removed in a future version:DeprecationWarning", # pyspark and impala leave sockets open "ignore:Exception ignored in:", - # dask - "ignore:Using the ``in`` operator to test for membership in Series is deprecated:FutureWarning", - "ignore:In a future version of pandas, a length 1 tuple will be returned when iterating over a 
groupby:FutureWarning", - "ignore:index is deprecated and will be removed in a future release:FutureWarning", - "ignore:`meta` is not specified:UserWarning", - "ignore:Concatenating dataframes with unknown divisions:UserWarning", - "ignore:Possible nested set at position:FutureWarning", - 'ignore:\s+You did not provide metadata:UserWarning', - "ignore:Minimal version of pyarrow will soon be increased:FutureWarning", - # Dask deprecation warning - switch to dask-expr - "ignore:The current Dask DataFrame implementation is deprecated:DeprecationWarning", - # numpy by way of dask - 'ignore:np\.find_common_type is deprecated:DeprecationWarning', # pandas "ignore:Boolean Series key will be reindexed:UserWarning", 'ignore:Using \.astype to convert from timezone-(naive|aware) dtype:FutureWarning', @@ -322,7 +303,6 @@ markers = [ "never: The backend will never support this / pass this test. Don't bother trying to fix it", "bigquery: BigQuery tests", "clickhouse: ClickHouse tests", - "dask: Dask tests", "datafusion: Apache Datafusion tests", "druid: Apache Druid tests", "duckdb: DuckDB tests", diff --git a/requirements-dev.txt b/requirements-dev.txt index ab76459ec9799..502750edb1132 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -34,7 +34,7 @@ charset-normalizer==3.3.2 ; python_version >= "3.10" and python_version < "4.0" cleo==2.1.0 ; python_version >= "3.10" and python_version < "4.0" click==8.1.7 ; python_version >= "3.10" and python_version < "4.0" clickhouse-connect[arrow,numpy,pandas]==0.7.18 ; python_version >= "3.10" and python_version < "4.0" -cloudpickle==3.0.0 ; python_version >= "3.10" and python_version < "4.0" +cloudpickle==3.0.0 ; python_version >= "3.10" and python_version < "3.13" codespell[hard-encoding-detection,toml]==2.3.0 ; python_version >= "3.10" and python_version < "4.0" colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0" and (sys_platform == "win32" or platform_system == "Windows" or python_version < "3.13" or os_name == "nt") comm==0.2.2 ; python_version >= "3.10" and python_version < "3.13" @@ -43,7 +43,6 @@ coverage[toml]==7.6.1 ; python_version >= "3.10" and python_version < "4.0" crashtest==0.4.1 ; python_version >= "3.10" and python_version < "4.0" cryptography==42.0.8 ; python_version >= "3.10" and python_version < "4.0" cycler==0.12.1 ; python_version >= "3.10" and python_version < "3.13" -dask[array,dataframe]==2024.2.1 ; python_version >= "3.10" and python_version < "4.0" datafusion==39.0.0 ; python_version >= "3.10" and python_version < "4.0" db-dtypes==1.2.0 ; python_version >= "3.10" and python_version < "4.0" debugpy==1.8.3 ; python_version >= "3.10" and python_version < "3.13" @@ -126,7 +125,6 @@ jupyterlite-core==0.3.0 ; python_version >= "3.10" and python_version < "3.13" jupyterlite-pyodide-kernel==0.3.2 ; python_version >= "3.10" and python_version < "3.13" keyring==24.3.1 ; python_version >= "3.10" and python_version < "4.0" kiwisolver==1.4.5 ; python_version >= "3.10" and python_version < "3.13" -locket==1.0.0 ; python_version >= "3.10" and python_version < "4.0" lonboard==0.9.3 ; python_version >= "3.10" and python_version < "3.13" lz4==4.3.3 ; python_version >= "3.10" and python_version < "4.0" markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "4.0" @@ -157,7 +155,6 @@ pandas[numpy]==2.2.2 ; python_version >= "3.10" and python_version < "4.0" pandocfilters==1.5.1 ; python_version >= "3.10" and python_version < "3.13" parso==0.8.4 ; python_version >= "3.10" and python_version < "4.0" parsy==2.1 ; 
python_version >= "3.10" and python_version < "4.0" -partd==1.4.2 ; python_version >= "3.10" and python_version < "4.0" pathspec==0.12.1 ; python_version >= "3.10" and python_version < "4.0" patsy==0.5.6 ; python_version >= "3.10" and python_version < "3.13" pexpect==4.9.0 ; python_version >= "3.10" and python_version < "4.0"