diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e66e866c4f4..1d43e1b32a8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: rev: 'v1.13.0' hooks: - id: mypy - additional_dependencies: [types-cachetools] + additional_dependencies: [types-cachetools, pyarrow-stubs] args: ["--config-file=pyproject.toml", "python/cudf/cudf", "python/custreamz/custreamz", diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 48644518177..21e59fc1685 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -457,6 +457,13 @@ def _generate_namespaces(namespaces): _intersphinx_extra_prefixes = ("rmm", "rmm::mr", "mr") +_external_intersphinx_aliases = { + "pandas": "pd", + "pyarrow": "pa", + "numpy": "np", + "cupy": "cp", +} + def _cached_intersphinx_lookup(env, node, contnode): """Perform an intersphinx lookup and cache the result. @@ -516,6 +523,17 @@ def on_missing_reference(app, env, node, contnode): # generates. Adding those would clutter the Sphinx output. return contnode + if node["refdomain"] == "py" and reftarget is not None: + # These replacements are needed because of + # https://github.com/sphinx-doc/sphinx/issues/10151 + for module, alias in _external_intersphinx_aliases.items(): + if f"{alias}." in node["reftarget"]: + node["reftarget"] = node["reftarget"].replace(alias, module) + if ( + ref := _cached_intersphinx_lookup(env, node, contnode) + ) is not None: + return ref + if node["refdomain"] in ("std", "cpp") and reftarget is not None: if any(toskip in reftarget for toskip in _names_to_skip_in_cpp): return contnode @@ -582,18 +600,11 @@ def on_missing_reference(app, env, node, contnode): # https://github.com/sphinx-doc/sphinx/issues/11225 nitpick_ignore = [ ("py:class", "Dtype"), - ("py:class", "cp.ndarray"), - ("py:class", "pd.DataFrame"), ("py:class", "pandas.core.indexes.frozen.FrozenList"), - ("py:class", "pa.Array"), - ("py:class", "pa.Table"), - ("py:class", "pa.ListType"), - ("py:class", "pa.Decimal128Type"), ("py:class", "ScalarLike"), ("py:class", "StringColumn"), ("py:class", "ColumnLike"), ("py:class", "DtypeObj"), - ("py:class", "pa.StructType"), ("py:class", "ArrowLike"), ] diff --git a/python/cudf/cudf/core/accessors/string.py b/python/cudf/cudf/core/accessors/string.py index 46a76c2a34f..a62e062590c 100644 --- a/python/cudf/cudf/core/accessors/string.py +++ b/python/cudf/cudf/core/accessors/string.py @@ -970,7 +970,7 @@ def replace( if regex: result = self._column.replace_re( list(pat), - as_column(repl, dtype=CUDF_STRING_DTYPE), + as_column(repl, dtype=CUDF_STRING_DTYPE), # type: ignore[arg-type] ) else: result = self._column.replace_multiple( diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 0e8e93af88a..d79a3940a3f 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -5,13 +5,12 @@ from typing import TYPE_CHECKING import cupy as cp -import pyarrow as pa import cudf from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.options import get_option -from cudf.utils.dtypes import can_convert_to_column, cudf_dtype_to_pa_type +from cudf.utils.dtypes import can_convert_to_column if TYPE_CHECKING: from cudf.core.index import Index @@ -98,10 +97,8 @@ def factorize( warnings.warn("size_hint is not applicable for cudf.factorize") if use_na_sentinel: - na_sentinel = pa.scalar(-1) cats = values.dropna() else: - na_sentinel = pa.scalar(None, type=cudf_dtype_to_pa_type(values.dtype)) cats = values cats = cats.unique().astype(values.dtype) @@ -111,7 +108,6 @@ def factorize( labels = values._label_encoding( cats=cats, - na_sentinel=na_sentinel, dtype="int64" if get_option("mode.pandas_compatible") else None, ).values diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index cf025fc9627..c62a3925e16 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -277,7 +277,20 @@ def _reduce( ) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: - other = self._normalize_binop_operand(other) + if isinstance(other, column.ColumnBase): + if ( + isinstance(other, CategoricalColumn) + and other.dtype != self.dtype + ): + raise TypeError( + "Categoricals can only compare with the same type" + ) + # We'll compare self's decategorized values later for non-CategoricalColumn + else: + codes = column.as_column( + self._encode(other), length=len(self), dtype=self.codes.dtype + ) + other = codes._with_type_metadata(self.dtype) equality_ops = {"__eq__", "__ne__", "NULL_EQUALS", "NULL_NOT_EQUALS"} if not self.ordered and op not in equality_ops: raise TypeError( @@ -299,23 +312,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return self._get_decategorized_column()._binaryop(other, op) return self.codes._binaryop(other.codes, op) - def _normalize_binop_operand( - self, other: ColumnBinaryOperand - ) -> column.ColumnBase: - if isinstance(other, column.ColumnBase): - if not isinstance(other, CategoricalColumn): - # We'll compare self's decategorized values later - return other - if other.dtype != self.dtype: - raise TypeError( - "Categoricals can only compare with the same type" - ) - return other - codes = column.as_column( - self._encode(other), length=len(self), dtype=self.codes.dtype - ) - return codes._with_type_metadata(self.dtype) - def sort_values(self, ascending: bool = True, na_position="last") -> Self: return self.codes.sort_values( # type: ignore[return-value] ascending, na_position @@ -386,7 +382,8 @@ def to_arrow(self) -> pa.Array: return pa.DictionaryArray.from_arrays( self.codes.astype(signed_type).to_arrow(), self.categories.to_arrow(), - ordered=self.ordered, + # TODO: Investigate if self.ordered can actually be None here + ordered=self.ordered if self.ordered is not None else False, ) def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 20415d9b6b7..c0f42db030e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -815,7 +815,7 @@ def to_arrow(self) -> pa.Array: return self.to_pylibcudf(mode="read").to_arrow() @classmethod - def from_arrow(cls, array: pa.Array) -> ColumnBase: + def from_arrow(cls, array: pa.Array | pa.ChunkedArray) -> ColumnBase: """ Convert PyArrow Array/ChunkedArray to column @@ -857,16 +857,23 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: if pa.types.is_dictionary(array.type): if isinstance(array, pa.Array): - codes = array.indices - dictionary = array.dictionary + dict_array = cast(pa.DictionaryArray, array) + codes: pa.Array | pa.ChunkedArray = dict_array.indices + dictionary: pa.Array | pa.ChunkedArray = dict_array.dictionary else: codes = pa.chunked_array( - [chunk.indices for chunk in array.chunks], + [ + cast(pa.DictionaryArray, chunk).indices + for chunk in array.chunks + ], type=array.type.index_type, ) dictionary = pc.unique( pa.chunked_array( - [chunk.dictionary for chunk in array.chunks], + [ + cast(pa.DictionaryArray, chunk).dictionary + for chunk in array.chunks + ], type=array.type.value_type, ) ) @@ -1171,11 +1178,6 @@ def __setitem__(self, key: Any, value: Any) -> None: if out: self._mimic_inplace(out, inplace=True) - def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase: - if is_na_like(other): - return pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype)) - return NotImplemented - def _all_bools_with_nulls( self, other: ColumnBase, bool_fill_value: bool ) -> ColumnBase: @@ -2089,7 +2091,6 @@ def _label_encoding( self, cats: ColumnBase, dtype: Dtype | None = None, - na_sentinel: pa.Scalar | None = None, ) -> NumericalColumn: """ Convert each value in `self` into an integer code, with `cats` @@ -2120,8 +2121,7 @@ def _label_encoding( ] dtype: int8 """ - if na_sentinel is None or not na_sentinel.is_valid: - na_sentinel = pa.scalar(-1) + na_sentinel = pa.scalar(-1) def _return_sentinel_column(): return as_column(na_sentinel, dtype=dtype, length=len(self)) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 6edebe8498c..4060b7b0cb6 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import re import warnings from locale import nl_langinfo -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Literal, cast import numpy as np import pandas as pd @@ -528,11 +528,11 @@ def as_string_column(self, dtype) -> StringColumn: def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: reflect, op = self._check_reflected_op(op) + if isinstance(other, cudf.DateOffset): + return other._datetime_binop(self, op, reflect=reflect) # type: ignore[attr-defined] other = self._normalize_binop_operand(other) if other is NotImplemented: return NotImplemented - elif isinstance(other, cudf.DateOffset): - return other._datetime_binop(self, op, reflect=reflect) # type: ignore[attr-defined] if reflect: lhs = other @@ -541,7 +541,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: lhs_unit = lhs.type.unit other_dtype = cudf_dtype_from_pa_type(lhs.type) else: - lhs_unit = lhs.time_unit # type: ignore[union-attr] + lhs_unit = lhs.time_unit # type: ignore[attr-defined] other_dtype = lhs.dtype rhs_unit = rhs.time_unit else: @@ -813,9 +813,9 @@ def to_pandas( ) def to_arrow(self) -> pa.Array: - return pa.compute.assume_timezone( - self._local_time.to_arrow(), str(self.dtype.tz) - ) + # Cast to expected timestamp array type for assume_timezone + local_array = cast(pa.TimestampArray, self._local_time.to_arrow()) + return pa.compute.assume_timezone(local_array, str(self.dtype.tz)) @functools.cached_property def time_unit(self) -> str: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 7a63ca061b1..888cf1d9994 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -35,6 +35,7 @@ pyarrow_dtype_to_cudf_dtype, ) from cudf.utils.scalar import pa_scalar_to_plc_scalar +from cudf.utils.utils import is_na_like if TYPE_CHECKING: from typing_extensions import Self @@ -99,13 +100,16 @@ def __cuda_array_interface__(self): @classmethod def _from_32_64_arrow( cls, - data: pa.Array, + data: pa.Array | pa.ChunkedArray, *, view_type: Literal["int32", "int64"], plc_type: plc.TypeId, step: int, ) -> Self: # Can remove when pyarrow 19 is the minimum version + # Handle ChunkedArray by combining chunks first + if isinstance(data, pa.ChunkedArray): + data = data.combine_chunks() mask_buf, data_buf = data.buffers() rmm_data_buffer = rmm.DeviceBuffer.to_device( np.frombuffer(data_buf) @@ -208,8 +212,32 @@ def __rtruediv__(self, other): def _binaryop(self, other: ColumnBinaryOperand, op: str): reflect, op = self._check_reflected_op(op) - other, other_cudf_dtype = self._normalize_binop_operand(other) # type: ignore[assignment] - if other is NotImplemented: + + # Inline _normalize_binop_operand functionality + if isinstance(other, ColumnBase): + if not isinstance(other, NumericalBaseColumn): + return NotImplemented + elif other.dtype.kind in {"f", "b"}: + raise TypeError( + "Decimal columns only support binary operations with " + "integer numerical columns." + ) + elif other.dtype.kind in {"i", "u"}: + other = other.astype( + type(self.dtype)(self.dtype.MAX_PRECISION, 0) + ) + elif not isinstance(self.dtype, other.dtype.__class__): + # This branch occurs if we have a DecimalBaseColumn of a + # different size (e.g. 64 instead of 32). + if _same_precision_and_scale(self.dtype, other.dtype): + other = other.astype(self.dtype) + other_cudf_dtype = other.dtype + elif isinstance(other, (int, Decimal)): + other_cudf_dtype = self.dtype._from_decimal(Decimal(other)) + elif is_na_like(other): + other = pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype)) + other_cudf_dtype = self.dtype + else: return NotImplemented if reflect: lhs_dtype = other_cudf_dtype @@ -311,33 +339,6 @@ def _validate_fillna_value( "integer values" ) - def _normalize_binop_operand( - self, other: Any - ) -> tuple[int | Decimal | ColumnBase, DecimalDtype]: - # TODO: Once pyarrow 19 is the minimum version, we can remove the - # passing the DecimalDtype since pyarrow scalars support decimal32/64 types - if isinstance(other, ColumnBase): - if not isinstance(other, NumericalBaseColumn): - return NotImplemented, self.dtype - elif other.dtype.kind in "fb": - raise TypeError( - "Decimal columns only support binary operations with " - "integer numerical columns." - ) - elif other.dtype.kind in "iu": - other = other.astype( - type(self.dtype)(self.dtype.MAX_PRECISION, 0) - ) - elif not isinstance(self.dtype, other.dtype.__class__): - # This branch occurs if we have a DecimalBaseColumn of a - # different size (e.g. 64 instead of 32). - if _same_precision_and_scale(self.dtype, other.dtype): - other = other.astype(self.dtype) - return other, other.dtype - elif isinstance(other, (int, Decimal)): - return other, self.dtype._from_decimal(Decimal(other)) - return super()._normalize_binop_operand(other), self.dtype - def as_numerical_column(self, dtype: np.dtype) -> NumericalColumn: return self.cast(dtype=dtype) # type: ignore[return-value] @@ -366,7 +367,7 @@ def __init__( ) @classmethod - def from_arrow(cls, data: pa.Array) -> Self: + def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self: return cls._from_32_64_arrow( data, view_type="int32", plc_type=plc.TypeId.DECIMAL32, step=4 ) @@ -398,7 +399,8 @@ def to_arrow(self) -> pa.Array: type=self.dtype.to_arrow(), offset=self._offset, length=self.size, - buffers=[mask_buf, data_buf], + # PyArrow stubs are too strict - from_buffers should accept None for missing buffers + buffers=[mask_buf, data_buf], # type: ignore[list-item] ) def _with_type_metadata( @@ -442,7 +444,7 @@ def __init__( ) @classmethod - def from_arrow(cls, data: pa.Array) -> Self: + def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self: result = cast(Decimal128Dtype, super().from_arrow(data)) result.dtype.precision = data.type.precision return result @@ -490,7 +492,7 @@ def __init__( ) @classmethod - def from_arrow(cls, data: pa.Array) -> Self: + def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self: return cls._from_32_64_arrow( data, view_type="int64", plc_type=plc.TypeId.DECIMAL64, step=2 ) @@ -516,7 +518,8 @@ def to_arrow(self) -> pa.Array: type=self.dtype.to_arrow(), offset=self._offset, length=self.size, - buffers=[mask_buf, data_buf], + # PyArrow stubs are too strict - from_buffers should accept None for missing buffers + buffers=[mask_buf, data_buf], # type: ignore[list-item] ) def _with_type_metadata( diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 4fae8d77d88..b6ef7089a9f 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -58,9 +58,13 @@ def _validate_dtype_instance(dtype: IntervalDtype) -> IntervalDtype: return dtype @classmethod - def from_arrow(cls, data: pa.Array) -> Self: - new_col = super().from_arrow(data.storage) - return new_col._with_type_metadata(IntervalDtype.from_arrow(data.type)) # type: ignore[return-value] + def from_arrow(cls, array: pa.Array | pa.ChunkedArray) -> Self: + if not isinstance(array, pa.ExtensionArray): + raise ValueError("Expected ExtensionArray for interval data") + new_col = super().from_arrow(array.storage) + return new_col._with_type_metadata( + IntervalDtype.from_arrow(array.type) + ) # type: ignore[return-value] def to_arrow(self) -> pa.Array: typ = self.dtype.to_arrow() diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index a29fb5eae13..73a5e75ab42 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -152,8 +152,7 @@ def base_size(self) -> int: def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # Lists only support __add__, which concatenates lists. reflect, op = self._check_reflected_op(op) - other = self._normalize_binop_operand(other) - if other is NotImplemented: + if not isinstance(other, type(self)): return NotImplemented if isinstance(other.dtype, ListDtype): if op == "__add__": @@ -192,11 +191,15 @@ def to_arrow(self) -> pa.Array: if self.nullable: nbuf = pa.py_buffer(self.mask.memoryview()) # type: ignore[union-attr] - buffers = (nbuf, offsets.buffers()[1]) + buffers = [nbuf, offsets.buffers()[1]] else: - buffers = offsets.buffers() + buffers = list(offsets.buffers()) return pa.ListArray.from_buffers( - pa_type, len(self), buffers, children=[elements] + pa_type, + len(self), + # PyArrow stubs are too strict - from_buffers should accept None for missing buffers + buffers, # type: ignore[arg-type] + children=[elements], ) def set_base_data(self, value): @@ -218,11 +221,6 @@ def __cuda_array_interface__(self): "Lists are not yet supported via `__cuda_array_interface__`" ) - def _normalize_binop_operand(self, other: Any) -> ColumnBase: - if isinstance(other, type(self)): - return other - return NotImplemented - def _with_type_metadata(self: Self, dtype: Dtype) -> Self: if isinstance(dtype, ListDtype): elements = self.base_children[1]._with_type_metadata( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c0ebc3343a1..948b71ce296 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -409,12 +409,20 @@ def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase: if not isinstance(other, type(self)): return NotImplemented return other - elif isinstance(other, (cp.ndarray, np.ndarray)) and other.ndim == 0: + # TODO: cupy scalars are just aliases for numpy scalars, so extracting a scalar + # from a cupy array would always require a D2H copy. As a result, cupy does not + # produce scalars without explicit casting requests + # https://docs.cupy.dev/en/stable/user_guide/difference.html#zero-dimensional-array + # The below logic for type inference relies on numpy, however, so we need to go + # that route for now. If possible we should find a way to avoid this. + if isinstance(other, cp.ndarray) and other.ndim == 0: + other = cp.asnumpy(other)[()] + elif isinstance(other, np.ndarray) and other.ndim == 0: other = other[()] if is_scalar(other): if is_na_like(other): - return super()._normalize_binop_operand(other) + return pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype)) if not isinstance(other, (int, float, complex)): # Go via NumPy to get the value other = np.array(other) @@ -444,7 +452,6 @@ def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase: self.dtype, np.result_type(self.dtype.numpy_dtype, other), # noqa: TID251 ) - else: common_dtype = np.result_type(self.dtype, other) # noqa: TID251 if common_dtype.kind in {"b", "i", "u", "f"}: # type: ignore[union-attr] diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9b4b46eb837..c3eadc3bd14 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4,7 +4,7 @@ import itertools from functools import cached_property -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -23,6 +23,7 @@ from cudf.utils.dtypes import ( CUDF_STRING_DTYPE, SIZE_TYPE_DTYPE, + cudf_dtype_to_pa_type, dtype_to_pylibcudf_type, get_dtype_of_same_kind, is_dtype_obj_string, @@ -517,15 +518,6 @@ def find_and_replace( res = self return res.replace(df._data["old"], df._data["new"]) - def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase: - if is_scalar(other): - if is_na_like(other): - return super()._normalize_binop_operand(other) - return pa.scalar(other) - elif isinstance(other, type(self)): - return other - return NotImplemented - def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: reflect, op = self._check_reflected_op(op) # Due to https://github.com/pandas-dev/pandas/issues/46332 we need to @@ -551,8 +543,12 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: elif op == "__ne__": return self.isnull() - other = self._normalize_binop_operand(other) - if other is NotImplemented: + if is_scalar(other): + if is_na_like(other): + other = pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype)) + else: + other = pa.scalar(other) # type: ignore[arg-type] + elif not isinstance(other, type(self)): return NotImplemented if isinstance(other, (StringColumn, pa.Scalar)): diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 4613747865c..a7ffed6e0bf 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -118,9 +118,10 @@ def to_arrow(self) -> pa.Array: ) if self.mask is not None: - buffers = (pa.py_buffer(self.mask.memoryview()),) + buffers = [pa.py_buffer(self.mask.memoryview())] else: - buffers = (None,) + # PyArrow stubs are too strict - from_buffers should accept None for missing buffers + buffers = [None] # type: ignore[list-item] return pa.StructArray.from_buffers( pa_type, len(self), buffers, children=children diff --git a/python/cudf/cudf/core/column/temporal_base.py b/python/cudf/cudf/core/column/temporal_base.py index c413591f15a..344c9004411 100644 --- a/python/cudf/cudf/core/column/temporal_base.py +++ b/python/cudf/cudf/core/column/temporal_base.py @@ -131,14 +131,12 @@ def _process_values_for_isin( def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase: if isinstance(other, ColumnBase): return other - elif self.dtype.kind == "M" and isinstance(other, cudf.DateOffset): - return other elif isinstance(other, (cp.ndarray, np.ndarray)) and other.ndim == 0: other = other[()] if is_scalar(other): if is_na_like(other): - return super()._normalize_binop_operand(other) + return pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype)) elif self.dtype.kind == "M" and isinstance(other, pd.Timestamp): if other.tz is not None: raise NotImplementedError( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index afc46e5ace3..27140a95e8d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5892,7 +5892,8 @@ def to_arrow(self, preserve_index: bool | None = None) -> pa.Table: ) out = super(DataFrame, data).to_arrow() - metadata = pa.pandas_compat.construct_metadata( + # PyArrow stubs don't recognize pandas_compat attribute + metadata = pa.pandas_compat.construct_metadata( # type: ignore[attr-defined] columns_to_convert=[self[col] for col in self._column_names], df=self, column_names=out.schema.names, diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 3eae1b8e84c..6a49987d5d4 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -662,7 +662,8 @@ def to_arrow(self) -> pa.StructType: StructType(struct) """ return pa.struct( - { + # dict[str, DataType] should be compatible but pyarrow stubs are too strict + { # type: ignore[arg-type] k: cudf_dtype_to_pa_type(dtype) for k, dtype in self.fields.items() } @@ -851,7 +852,11 @@ def to_arrow(self) -> pa.Decimal128Type: return pa.decimal128(self.precision, self.scale) @classmethod - def from_arrow(cls, typ: pa.Decimal128Type) -> Self: + def from_arrow( + cls, typ: pa.Decimal32Type | pa.Decimal64Type | pa.Decimal128Type + ) -> Self: + # TODO: Eventually narrow this to only accept the appropriate decimal type + # for each specific DecimalNDtype subclass """ Construct a cudf decimal dtype from a ``pyarrow`` dtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6f39c567666..f1fb883b5ae 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -9,7 +9,6 @@ from typing import TYPE_CHECKING, Any, Literal import cupy -import numpy import numpy as np import pyarrow as pa from typing_extensions import Self @@ -552,13 +551,13 @@ def _to_array( copy: bool, dtype: Dtype | None = None, na_value=no_default, - ) -> cupy.ndarray | numpy.ndarray: + ) -> cupy.ndarray | np.ndarray: # Internal function to implement to_cupy and to_numpy, which are nearly # identical except for the attribute they access to generate values. def to_array( col: ColumnBase, to_dtype: np.dtype - ) -> cupy.ndarray | numpy.ndarray: + ) -> cupy.ndarray | np.ndarray: if ( col.has_nulls() and dtype is not None @@ -610,7 +609,7 @@ def to_array( if ncol == 0: return module.empty( shape=(len(self), ncol), - dtype=numpy.dtype("float64"), + dtype=np.dtype("float64"), order="F", ) @@ -633,9 +632,9 @@ def to_array( "Cannot convert to cupy bool array with nulls." ) else: - to_dtype = numpy.dtype("object") + to_dtype = np.dtype("object") elif to_dtype.kind in "ui": - to_dtype = numpy.dtype("float64") + to_dtype = np.dtype("float64") if cudf.get_option( "mode.pandas_compatible" @@ -646,7 +645,7 @@ def to_array( if isinstance(to_dtype, cudf.CategoricalDtype): to_dtype = to_dtype.categories.dtype - if not isinstance(to_dtype, numpy.dtype): + if not isinstance(to_dtype, np.dtype): raise NotImplementedError( f"{to_dtype} cannot be exposed as an array" ) @@ -800,7 +799,7 @@ def to_numpy( dtype: Dtype | None = None, copy: bool = True, na_value=no_default, - ) -> numpy.ndarray: + ) -> np.ndarray: """Convert the Frame to a NumPy array. Parameters @@ -826,7 +825,7 @@ def to_numpy( ) return self._to_array( - lambda col: col.values_host, numpy, copy, dtype, na_value + lambda col: col.values_host, np, copy, dtype, na_value ) @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 46e712572bd..177eacb0276 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -11,10 +11,7 @@ from functools import cached_property, singledispatch from typing import TYPE_CHECKING, Any, Literal -# Needed to make Sphinx happy for typing purposes -import cupy import cupy as cp -import numpy import numpy as np import pandas as pd import pyarrow as pa @@ -582,7 +579,7 @@ def groups(self): ) @cached_property - def indices(self) -> dict[ScalarLike, cupy.ndarray]: + def indices(self) -> dict[ScalarLike, cp.ndarray]: """ Dict {group name -> group indices}. @@ -1473,7 +1470,7 @@ def sample( frac: float | None = None, replace: bool = False, weights: Sequence | Series | None = None, - random_state: numpy.random.RandomState | int | None = None, + random_state: np.random.RandomState | int | None = None, ): """Return a random sample of items in each group. diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index dadd8e3a86b..e5c9fb0428a 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -25,8 +25,7 @@ if TYPE_CHECKING: from collections.abc import Hashable - import cupy - import numpy + import numpy as np import pyarrow as pa from cudf._typing import Dtype, NotImplementedType, ScalarLike @@ -104,7 +103,7 @@ def _column(self) -> ColumnBase: @property # type: ignore @_performance_tracking - def values(self) -> cupy.ndarray: + def values(self) -> cp.ndarray: col = self._column if col.dtype.kind in {"i", "u", "f", "b"} and not col.has_nulls(): return cp.asarray(col) @@ -120,7 +119,7 @@ def to_cupy( dtype: Dtype | None = None, copy: bool = False, na_value=None, - ) -> cupy.ndarray: + ) -> cp.ndarray: """ Convert the SingleColumnFrame (e.g., Series) to a CuPy array. @@ -149,7 +148,7 @@ def to_cupy( @property # type: ignore @_performance_tracking - def values_host(self) -> numpy.ndarray: + def values_host(self) -> np.ndarray: return self._column.values_host @classmethod @@ -281,7 +280,7 @@ def __cuda_array_interface__(self): @_performance_tracking def factorize( self, sort: bool = False, use_na_sentinel: bool = True - ) -> tuple[cupy.ndarray, Index]: + ) -> tuple[cp.ndarray, Index]: """Encode the input values as integer labels. Parameters diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index eea3a465729..9affa8599ac 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1605,7 +1605,18 @@ def to_parquet( index = True pa_table = df.to_arrow(preserve_index=index) - return pq.write_to_dataset( + # Check for conflicting arguments in kwargs + if "root_path" in kwargs: + raise ValueError( + "'root_path' should be passed as 'path' argument to to_parquet(), not in kwargs" + ) + if "partition_cols" in kwargs: + raise ValueError( + "'partition_cols' should be passed directly to to_parquet(), not in kwargs" + ) + # Type ignore: mypy complains about potential duplicate arguments from *args + # but our API design allows passing additional args/kwargs to pyarrow + return pq.write_to_dataset( # type: ignore[misc] pa_table, root_path=path, partition_cols=partition_cols, @@ -2480,7 +2491,7 @@ def _process_metadata( idx = Index._from_column(column_empty(0)) else: start = range_index_meta["start"] + skip_rows # type: ignore[operator] - stop = range_index_meta["stop"] + stop = int(range_index_meta["stop"]) # type: ignore[arg-type] if nrows > -1: stop = start + nrows idx = RangeIndex( diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 1b2595072be..a1338bc778a 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -3,6 +3,7 @@ from __future__ import annotations import warnings +from typing import cast import cupy as cp import numpy as np @@ -43,7 +44,9 @@ def _string_view_to_string_schema(schema: pa.Schema) -> pa.Schema: ) for f in schema ], - metadata=schema.metadata, + # Cast needed because schema.metadata is dict[bytes, bytes] but + # pa.schema expects dict[bytes | str, bytes | str] | None + metadata=cast(dict[bytes | str, bytes | str] | None, schema.metadata), ) diff --git a/python/cudf/cudf/tests/series/test_binops.py b/python/cudf/cudf/tests/series/test_binops.py index ddc63361765..d4f17063f3e 100644 --- a/python/cudf/cudf/tests/series/test_binops.py +++ b/python/cudf/cudf/tests/series/test_binops.py @@ -3087,3 +3087,14 @@ def test_binops_compare_stdlib_date_scalar(comparison_op): result = comparison_op(cudf.Series(data), dt) expected = comparison_op(pd.Series(data), dt) assert_eq(result, expected) + + +@pytest.mark.parametrize("xp", [cp, np]) +def test_singleton_array(binary_op, xp): + # Validate that we handle singleton numpy/cupy arrays appropriately + lhs = cudf.Series([1, 2, 3]) + rhs_device = xp.array(1) + rhs_host = np.array(1) + expect = binary_op(lhs.to_pandas(), rhs_host) + got = binary_op(lhs, rhs_device) + assert_eq(expect, got) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index b10b5f1247c..e957da30185 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -133,6 +133,8 @@ def cudf_dtype_from_pa_type(typ: pa.DataType) -> DtypeObj: elif pa.types.is_struct(typ): return cudf.core.dtypes.StructDtype.from_arrow(typ) elif pa.types.is_decimal(typ): + if isinstance(typ, pa.Decimal256Type): + raise NotImplementedError("cudf does not support Decimal256Type") return cudf.core.dtypes.Decimal128Dtype.from_arrow(typ) elif pa.types.is_large_string(typ) or pa.types.is_string(typ): return CUDF_STRING_DTYPE @@ -397,8 +399,8 @@ def is_dtype_obj_numeric( pa_decimal64type = getattr(pa, "Decimal64Type", None) -def pyarrow_dtype_to_cudf_dtype(dtype: pa.DataType) -> DtypeObj: - """Given a pyarrow dtype, converts it into the equivalent cudf pandas +def pyarrow_dtype_to_cudf_dtype(dtype: pd.ArrowDtype) -> DtypeObj: + """Given a pandas ArrowDtype, converts it into the equivalent cudf pandas dtype. """ diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 47e5ecee570..1fd50c7e66d 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1506,7 +1506,7 @@ def _index_level_name( def generate_pandas_metadata(table: DataFrame, index: bool | None) -> str: col_names: list[Hashable] = [] - types = [] + types: list[pa.DataType] = [] index_levels = [] index_descriptors = [] df_meta = table.head(0) @@ -1577,7 +1577,7 @@ def generate_pandas_metadata(table: DataFrame, index: bool | None) -> str: index_levels.append(idx) index_descriptors.append(descr) - metadata = pa.pandas_compat.construct_metadata( + metadata = pa.pandas_compat.construct_metadata( # type: ignore[attr-defined] columns_to_convert=columns_to_convert, # It is OKAY to do `.to_pandas()` because # this method will extract `.columns` metadata only diff --git a/python/cudf/cudf/utils/scalar.py b/python/cudf/cudf/utils/scalar.py index 0ccb700e035..1d74579629d 100644 --- a/python/cudf/cudf/utils/scalar.py +++ b/python/cudf/cudf/utils/scalar.py @@ -2,7 +2,7 @@ from __future__ import annotations import functools -from typing import Any +from typing import Any, cast import pandas as pd import pyarrow as pa @@ -46,11 +46,14 @@ def maybe_nested_pa_scalar_to_py(pa_scalar: pa.Scalar) -> Any: if not pa_scalar.is_valid: return pd.NA if pa.types.is_struct(pa_scalar.type): + struct_scalar = cast(pa.StructScalar, pa_scalar) return { str(i): maybe_nested_pa_scalar_to_py(val) - for i, (_, val) in enumerate(pa_scalar.items()) + for i, (_, val) in enumerate(struct_scalar.items()) } elif pa.types.is_list(pa_scalar.type): - return [maybe_nested_pa_scalar_to_py(val) for val in pa_scalar] + list_scalar = cast(pa.ListScalar, pa_scalar) + # TODO: Fix pyarrow-stubs typing - ListScalar iteration should yield Scalar objects + return [maybe_nested_pa_scalar_to_py(val) for val in list_scalar] # type: ignore[arg-type] else: return pa_scalar.as_py()