Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.12' into pylibcudf/i…
Browse files Browse the repository at this point in the history
…o/text
  • Loading branch information
mroeschke committed Nov 4, 2024
2 parents b06d64e + 45563b3 commit aa45811
Show file tree
Hide file tree
Showing 37 changed files with 2,834 additions and 2,681 deletions.
28 changes: 21 additions & 7 deletions python/cudf/cudf/pandas/_wrappers/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,27 @@ def _pandas_util_dir():
# In pandas 2.0, pandas.util contains public APIs under
# __getattr__ but no __dir__ to find them
# https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/util/__init__.py
return list(importlib.import_module("pandas.util").__dict__.keys()) + [
"hash_array",
"hash_pandas_object",
"Appender",
"Substitution",
"cache_readonly",
]
res = list(
set(
list(importlib.import_module("pandas.util").__dict__.keys())
+ [
"Appender",
"Substitution",
"_exceptions",
"_print_versions",
"cache_readonly",
"hash_array",
"hash_pandas_object",
"version",
"_tester",
"_validators",
"_decorators",
]
)
)
if cudf.core._compat.PANDAS_GE_220:
res.append("capitalize_first_letter")
return res


pd.util.__dir__ = _pandas_util_dir
Expand Down
18 changes: 18 additions & 0 deletions python/cudf/cudf_pandas_tests/test_cudf_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1759,3 +1759,21 @@ def test_fallback_raises_error(monkeypatch):
monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True")
with pytest.raises(ProxyFallbackError):
pd.Series(range(2)).astype(object)


@pytest.mark.parametrize(
"attrs",
[
"_exceptions",
"version",
"_print_versions",
"capitalize_first_letter",
"_validators",
"_decorators",
],
)
def test_cudf_pandas_util_version(attrs):
if not PANDAS_GE_220 and attrs == "capitalize_first_letter":
assert not hasattr(pd.util, attrs)
else:
assert hasattr(pd.util, attrs)
51 changes: 33 additions & 18 deletions python/dask_cudf/dask_cudf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from dask import config

# For dask>2024.2.0, we can silence the loud deprecation
# warning before importing `dask.dataframe` (this won't
# do anything for dask==2024.2.0)
config.set({"dataframe.query-planning-warning": False})
import warnings
from importlib import import_module

import dask.dataframe as dd # noqa: E402
from dask import config
import dask.dataframe as dd
from dask.dataframe import from_delayed # noqa: E402

import cudf # noqa: E402

from . import backends # noqa: E402, F401
from ._version import __git_commit__, __version__ # noqa: E402, F401
from .core import concat, from_cudf, from_dask_dataframe # noqa: E402
from .expr import QUERY_PLANNING_ON # noqa: E402
from .core import concat, from_cudf, DataFrame, Index, Series # noqa: F401

QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED


def read_csv(*args, **kwargs):
Expand All @@ -38,34 +36,51 @@ def read_parquet(*args, **kwargs):
return dd.read_parquet(*args, **kwargs)


def raise_not_implemented_error(attr_name):
def _deprecated_api(old_api, new_api=None, rec=None):
def inner_func(*args, **kwargs):
if new_api:
# Use alternative
msg = f"{old_api} is now deprecated. "
msg += rec or f"Please use {new_api} instead."
warnings.warn(msg, FutureWarning)
new_attr = new_api.split(".")
module = import_module(".".join(new_attr[:-1]))
return getattr(module, new_attr[-1])(*args, **kwargs)

# No alternative - raise an error
raise NotImplementedError(
f"Top-level {attr_name} API is not available for dask-expr."
f"{old_api} is no longer supported. " + (rec or "")
)

return inner_func


if QUERY_PLANNING_ON:
from .expr._collection import DataFrame, Index, Series
from ._expr.expr import _patch_dask_expr
from . import io # noqa: F401

groupby_agg = raise_not_implemented_error("groupby_agg")
groupby_agg = _deprecated_api("dask_cudf.groupby_agg")
read_text = DataFrame.read_text
to_orc = raise_not_implemented_error("to_orc")
_patch_dask_expr()

else:
from .core import DataFrame, Index, Series # noqa: F401
from .groupby import groupby_agg # noqa: F401
from .io import read_text, to_orc # noqa: F401
from ._legacy.groupby import groupby_agg # noqa: F401
from ._legacy.io import read_text # noqa: F401
from . import io # noqa: F401


to_orc = _deprecated_api(
"dask_cudf.to_orc",
new_api="dask_cudf._legacy.io.to_orc",
rec="Please use DataFrame.to_orc instead.",
)


__all__ = [
"DataFrame",
"Series",
"Index",
"from_cudf",
"from_dask_dataframe",
"concat",
"from_delayed",
]
Expand Down
1 change: 1 addition & 0 deletions python/dask_cudf/dask_cudf/_expr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -34,22 +34,6 @@


class CudfFrameBase(FrameBase):
def to_dask_dataframe(self, **kwargs):
"""Create a dask.dataframe object from a dask_cudf object
WARNING: This API is deprecated, and may not work properly.
Please use `*.to_backend("pandas")` to convert the
underlying data to pandas.
"""

warnings.warn(
"The `to_dask_dataframe` API is now deprecated. "
"Please use `*.to_backend('pandas')` instead.",
FutureWarning,
)

return self.to_backend("pandas", **kwargs)

def _prepare_cov_corr(self, min_periods, numeric_only):
# Upstream version of this method sets min_periods
# to 2 by default (which is not supported by cudf)
Expand Down Expand Up @@ -94,7 +78,7 @@ def var(
def rename_axis(
self, mapper=no_default, index=no_default, columns=no_default, axis=0
):
from dask_cudf.expr._expr import RenameAxisCudf
from dask_cudf._expr.expr import RenameAxisCudf

return new_collection(
RenameAxisCudf(
Expand Down Expand Up @@ -136,7 +120,7 @@ def groupby(
dropna=None,
**kwargs,
):
from dask_cudf.expr._groupby import GroupBy
from dask_cudf._expr.groupby import GroupBy

if isinstance(by, FrameBase) and not isinstance(by, DXSeries):
raise ValueError(
Expand Down Expand Up @@ -169,33 +153,36 @@ def groupby(
)

def to_orc(self, *args, **kwargs):
return self.to_legacy_dataframe().to_orc(*args, **kwargs)
from dask_cudf._legacy.io import to_orc

return to_orc(self, *args, **kwargs)
# return self.to_legacy_dataframe().to_orc(*args, **kwargs)

@staticmethod
def read_text(*args, **kwargs):
from dask_expr import from_legacy_dataframe

from dask_cudf.io.text import read_text as legacy_read_text
from dask_cudf._legacy.io.text import read_text as legacy_read_text

ddf = legacy_read_text(*args, **kwargs)
return from_legacy_dataframe(ddf)


class Series(DXSeries, CudfFrameBase):
def groupby(self, by, **kwargs):
from dask_cudf.expr._groupby import SeriesGroupBy
from dask_cudf._expr.groupby import SeriesGroupBy

return SeriesGroupBy(self, by, **kwargs)

@cached_property
def list(self):
from dask_cudf.accessors import ListMethods
from dask_cudf._expr.accessors import ListMethods

return ListMethods(self)

@cached_property
def struct(self):
from dask_cudf.accessors import StructMethods
from dask_cudf._expr.accessors import StructMethods

return StructMethods(self)

Expand Down
Loading

0 comments on commit aa45811

Please sign in to comment.