Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit. Hold Shift + click to select a range.
96fbef9
Add back pyarrow-stubs
vyasr Sep 24, 2025
2873de5
Fix pyarrow StructScalar type casting in cudf/utils/scalar.py
vyasr Sep 24, 2025
79271fd
Fix DecimalDtype.from_arrow compatibility and add Decimal256Type check
vyasr Sep 24, 2025
b35729c
Fix pyarrow_dtype_to_cudf_dtype function signature and docstring
vyasr Sep 24, 2025
dadab95
Fix schema metadata type casting in cudf/testing/testing.py
vyasr Sep 24, 2025
6ed69f7
Add type ignore for pyarrow struct creation in cudf/core/dtypes.py
vyasr Sep 24, 2025
3e59d74
Fix DataType list typing and pandas_compat attribute in cudf/utils/io…
vyasr Sep 24, 2025
9ba9b5a
Fix DictionaryArray.from_arrays ordered parameter handling
vyasr Sep 24, 2025
8845521
Fix DictionaryArray type casting in cudf/core/column/column.py
vyasr Sep 24, 2025
b43e327
Add type ignore for ListScalar iteration in cudf/utils/scalar.py
vyasr Sep 25, 2025
8106957
Fix Buffer tuple types in cudf/core/column/struct.py
vyasr Sep 25, 2025
39433d5
Fix Array.storage attribute in cudf/core/column/interval.py
vyasr Sep 25, 2025
4731ed5
Remove ColumnBase._normalize_binop_operand and inline functionality
vyasr Sep 25, 2025
425c8ed
Fix Buffer list compatibility in cudf/core/column/decimal.py
vyasr Sep 25, 2025
841736d
Fix DateOffset return type in cudf/core/column/temporal_base.py
vyasr Sep 25, 2025
5641a23
Fix Buffer list assignment in cudf/core/column/lists.py
vyasr Sep 25, 2025
244fe9b
Fix Buffer list compatibility in cudf/core/column/lists.py
vyasr Sep 25, 2025
8fde16a
Fix time_unit attribute and assume_timezone overload in cudf/core/col…
vyasr Sep 25, 2025
9f6bf8c
Remove na_sentinel parameter from _label_encoding method
vyasr Sep 25, 2025
5322d8f
Fix ChunkedArray compatibility in ColumnBase.from_arrow
vyasr Sep 25, 2025
7930306
Fix pandas_compat attribute in cudf/core/dataframe.py
vyasr Sep 25, 2025
64a4620
Fix subclass from_arrow signatures to match base class
vyasr Sep 25, 2025
586323a
Fix ChunkedArray handling in from_arrow method
vyasr Sep 25, 2025
b8eb8e3
Fix _from_32_64_arrow method to handle ChunkedArray
vyasr Sep 25, 2025
ffff3fc
Fix ChunkedArray type annotations in dictionary handling
vyasr Sep 25, 2025
c9ef94c
Fix StringColumn replace_re argument type in string accessor
vyasr Sep 25, 2025
beb8d56
Fix write_to_dataset duplicate arguments in cudf/io/parquet.py
vyasr Sep 26, 2025
eaad1fa
Fix variable assignment type in cudf/io/parquet.py
vyasr Sep 26, 2025
13a4734
Inline lists.py _normalize_binop_operand function
vyasr Sep 26, 2025
3514afa
Inline string.py _normalize_binop_operand function
vyasr Sep 26, 2025
1c4882e
Inline categorical.py _normalize_binop_operand function
vyasr Sep 26, 2025
d14d6af
Inline numerical.py _normalize_binop_operand function
vyasr Sep 26, 2025
b389893
Inline decimal.py _normalize_binop_operand function
vyasr Sep 26, 2025
81aab7e
Move DateOffset logic to child class to avoid incorrect handling
vyasr Sep 26, 2025
de931cf
Revert "Inline numerical.py _normalize_binop_operand function"
vyasr Sep 26, 2025
d470e4e
Fix cupy array handling and add a test
vyasr Sep 26, 2025
e4da5d7
Enable intersphinx to find import-aliased third-party modules in type…
vyasr Sep 26, 2025
94f53df
Address reviews
vyasr Sep 27, 2025
e11326f
Merge branch 'branch-25.12' into fix/pyarrow_typing
vyasr Sep 29, 2025
f2e5016
Apply suggestions from code review
vyasr Sep 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ repos:
rev: 'v1.13.0'
hooks:
- id: mypy
additional_dependencies: [types-cachetools]
additional_dependencies: [types-cachetools, pyarrow-stubs]
args: ["--config-file=pyproject.toml",
"python/cudf/cudf",
"python/custreamz/custreamz",
Expand Down
25 changes: 18 additions & 7 deletions docs/cudf/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,13 @@ def _generate_namespaces(namespaces):

_intersphinx_extra_prefixes = ("rmm", "rmm::mr", "mr")

_external_intersphinx_aliases = {
"pandas": "pd",
"pyarrow": "pa",
"numpy": "np",
"cupy": "cp",
}


def _cached_intersphinx_lookup(env, node, contnode):
"""Perform an intersphinx lookup and cache the result.
Expand Down Expand Up @@ -516,6 +523,17 @@ def on_missing_reference(app, env, node, contnode):
# generates. Adding those would clutter the Sphinx output.
return contnode

if node["refdomain"] == "py" and reftarget is not None:
# These replacements are needed because of
# https://github.com/sphinx-doc/sphinx/issues/10151
for module, alias in _external_intersphinx_aliases.items():
if f"{alias}." in node["reftarget"]:
node["reftarget"] = node["reftarget"].replace(alias, module)
if (
ref := _cached_intersphinx_lookup(env, node, contnode)
) is not None:
return ref

if node["refdomain"] in ("std", "cpp") and reftarget is not None:
if any(toskip in reftarget for toskip in _names_to_skip_in_cpp):
return contnode
Expand Down Expand Up @@ -582,18 +600,11 @@ def on_missing_reference(app, env, node, contnode):
# https://github.com/sphinx-doc/sphinx/issues/11225
nitpick_ignore = [
("py:class", "Dtype"),
("py:class", "cp.ndarray"),
("py:class", "pd.DataFrame"),
("py:class", "pandas.core.indexes.frozen.FrozenList"),
("py:class", "pa.Array"),
("py:class", "pa.Table"),
("py:class", "pa.ListType"),
("py:class", "pa.Decimal128Type"),
("py:class", "ScalarLike"),
("py:class", "StringColumn"),
("py:class", "ColumnLike"),
("py:class", "DtypeObj"),
("py:class", "pa.StructType"),
("py:class", "ArrowLike"),
]

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/accessors/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,7 @@ def replace(
if regex:
result = self._column.replace_re(
list(pat),
as_column(repl, dtype=CUDF_STRING_DTYPE),
as_column(repl, dtype=CUDF_STRING_DTYPE), # type: ignore[arg-type]
)
else:
result = self._column.replace_multiple(
Expand Down
6 changes: 1 addition & 5 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,12 @@
from typing import TYPE_CHECKING

import cupy as cp
import pyarrow as pa

import cudf
from cudf.core.column import as_column
from cudf.core.dtypes import CategoricalDtype
from cudf.options import get_option
from cudf.utils.dtypes import can_convert_to_column, cudf_dtype_to_pa_type
from cudf.utils.dtypes import can_convert_to_column

if TYPE_CHECKING:
from cudf.core.index import Index
Expand Down Expand Up @@ -98,10 +97,8 @@ def factorize(
warnings.warn("size_hint is not applicable for cudf.factorize")

if use_na_sentinel:
na_sentinel = pa.scalar(-1)
cats = values.dropna()
else:
na_sentinel = pa.scalar(None, type=cudf_dtype_to_pa_type(values.dtype))
cats = values

cats = cats.unique().astype(values.dtype)
Expand All @@ -111,7 +108,6 @@ def factorize(

labels = values._label_encoding(
cats=cats,
na_sentinel=na_sentinel,
dtype="int64" if get_option("mode.pandas_compatible") else None,
).values

Expand Down
35 changes: 16 additions & 19 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,20 @@ def _reduce(
)

def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
other = self._normalize_binop_operand(other)
if isinstance(other, column.ColumnBase):
if (
isinstance(other, CategoricalColumn)
and other.dtype != self.dtype
):
raise TypeError(
"Categoricals can only compare with the same type"
)
# We'll compare self's decategorized values later for non-CategoricalColumn
else:
codes = column.as_column(
self._encode(other), length=len(self), dtype=self.codes.dtype
)
other = codes._with_type_metadata(self.dtype)
equality_ops = {"__eq__", "__ne__", "NULL_EQUALS", "NULL_NOT_EQUALS"}
if not self.ordered and op not in equality_ops:
raise TypeError(
Expand All @@ -299,23 +312,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
return self._get_decategorized_column()._binaryop(other, op)
return self.codes._binaryop(other.codes, op)

def _normalize_binop_operand(
self, other: ColumnBinaryOperand
) -> column.ColumnBase:
if isinstance(other, column.ColumnBase):
if not isinstance(other, CategoricalColumn):
# We'll compare self's decategorized values later
return other
if other.dtype != self.dtype:
raise TypeError(
"Categoricals can only compare with the same type"
)
return other
codes = column.as_column(
self._encode(other), length=len(self), dtype=self.codes.dtype
)
return codes._with_type_metadata(self.dtype)

def sort_values(self, ascending: bool = True, na_position="last") -> Self:
return self.codes.sort_values( # type: ignore[return-value]
ascending, na_position
Expand Down Expand Up @@ -386,7 +382,8 @@ def to_arrow(self) -> pa.Array:
return pa.DictionaryArray.from_arrays(
self.codes.astype(signed_type).to_arrow(),
self.categories.to_arrow(),
ordered=self.ordered,
# TODO: Investigate if self.ordered can actually be None here
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need a corresponding issue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm planning on investigating this in my follow-up PR (in progress) so I'll open an issue if I need to.

ordered=self.ordered if self.ordered is not None else False,
)

def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self:
Expand Down
26 changes: 13 additions & 13 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,7 +815,7 @@ def to_arrow(self) -> pa.Array:
return self.to_pylibcudf(mode="read").to_arrow()

@classmethod
def from_arrow(cls, array: pa.Array) -> ColumnBase:
def from_arrow(cls, array: pa.Array | pa.ChunkedArray) -> ColumnBase:
"""
Convert PyArrow Array/ChunkedArray to column

Expand Down Expand Up @@ -857,16 +857,23 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:

if pa.types.is_dictionary(array.type):
if isinstance(array, pa.Array):
codes = array.indices
dictionary = array.dictionary
dict_array = cast(pa.DictionaryArray, array)
codes: pa.Array | pa.ChunkedArray = dict_array.indices
dictionary: pa.Array | pa.ChunkedArray = dict_array.dictionary
else:
codes = pa.chunked_array(
[chunk.indices for chunk in array.chunks],
[
cast(pa.DictionaryArray, chunk).indices
for chunk in array.chunks
],
type=array.type.index_type,
)
dictionary = pc.unique(
pa.chunked_array(
[chunk.dictionary for chunk in array.chunks],
[
cast(pa.DictionaryArray, chunk).dictionary
for chunk in array.chunks
],
type=array.type.value_type,
)
)
Expand Down Expand Up @@ -1171,11 +1178,6 @@ def __setitem__(self, key: Any, value: Any) -> None:
if out:
self._mimic_inplace(out, inplace=True)

def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase:
if is_na_like(other):
return pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype))
return NotImplemented

def _all_bools_with_nulls(
self, other: ColumnBase, bool_fill_value: bool
) -> ColumnBase:
Expand Down Expand Up @@ -2089,7 +2091,6 @@ def _label_encoding(
self,
cats: ColumnBase,
dtype: Dtype | None = None,
na_sentinel: pa.Scalar | None = None,
) -> NumericalColumn:
"""
Convert each value in `self` into an integer code, with `cats`
Expand Down Expand Up @@ -2120,8 +2121,7 @@ def _label_encoding(
]
dtype: int8
"""
if na_sentinel is None or not na_sentinel.is_valid:
na_sentinel = pa.scalar(-1)
na_sentinel = pa.scalar(-1)

def _return_sentinel_column():
return as_column(na_sentinel, dtype=dtype, length=len(self))
Expand Down
14 changes: 7 additions & 7 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import re
import warnings
from locale import nl_langinfo
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING, Literal, cast

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -528,11 +528,11 @@ def as_string_column(self, dtype) -> StringColumn:

def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
reflect, op = self._check_reflected_op(op)
if isinstance(other, cudf.DateOffset):
return other._datetime_binop(self, op, reflect=reflect) # type: ignore[attr-defined]
other = self._normalize_binop_operand(other)
if other is NotImplemented:
return NotImplemented
elif isinstance(other, cudf.DateOffset):
return other._datetime_binop(self, op, reflect=reflect) # type: ignore[attr-defined]

if reflect:
lhs = other
Expand All @@ -541,7 +541,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
lhs_unit = lhs.type.unit
other_dtype = cudf_dtype_from_pa_type(lhs.type)
else:
lhs_unit = lhs.time_unit # type: ignore[union-attr]
lhs_unit = lhs.time_unit # type: ignore[attr-defined]
other_dtype = lhs.dtype
rhs_unit = rhs.time_unit
else:
Expand Down Expand Up @@ -813,9 +813,9 @@ def to_pandas(
)

def to_arrow(self) -> pa.Array:
return pa.compute.assume_timezone(
self._local_time.to_arrow(), str(self.dtype.tz)
)
# Cast to expected timestamp array type for assume_timezone
local_array = cast(pa.TimestampArray, self._local_time.to_arrow())
return pa.compute.assume_timezone(local_array, str(self.dtype.tz))

@functools.cached_property
def time_unit(self) -> str:
Expand Down
Loading
Loading