Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit. Hold Shift + click to select a range.
96fbef9
Add back pyarrow-stubs
vyasr Sep 24, 2025
2873de5
Fix pyarrow StructScalar type casting in cudf/utils/scalar.py
vyasr Sep 24, 2025
79271fd
Fix DecimalDtype.from_arrow compatibility and add Decimal256Type check
vyasr Sep 24, 2025
b35729c
Fix pyarrow_dtype_to_cudf_dtype function signature and docstring
vyasr Sep 24, 2025
dadab95
Fix schema metadata type casting in cudf/testing/testing.py
vyasr Sep 24, 2025
6ed69f7
Add type ignore for pyarrow struct creation in cudf/core/dtypes.py
vyasr Sep 24, 2025
3e59d74
Fix DataType list typing and pandas_compat attribute in cudf/utils/io…
vyasr Sep 24, 2025
9ba9b5a
Fix DictionaryArray.from_arrays ordered parameter handling
vyasr Sep 24, 2025
8845521
Fix DictionaryArray type casting in cudf/core/column/column.py
vyasr Sep 24, 2025
b43e327
Add type ignore for ListScalar iteration in cudf/utils/scalar.py
vyasr Sep 25, 2025
8106957
Fix Buffer tuple types in cudf/core/column/struct.py
vyasr Sep 25, 2025
39433d5
Fix Array.storage attribute in cudf/core/column/interval.py
vyasr Sep 25, 2025
4731ed5
Remove ColumnBase._normalize_binop_operand and inline functionality
vyasr Sep 25, 2025
425c8ed
Fix Buffer list compatibility in cudf/core/column/decimal.py
vyasr Sep 25, 2025
841736d
Fix DateOffset return type in cudf/core/column/temporal_base.py
vyasr Sep 25, 2025
5641a23
Fix Buffer list assignment in cudf/core/column/lists.py
vyasr Sep 25, 2025
244fe9b
Fix Buffer list compatibility in cudf/core/column/lists.py
vyasr Sep 25, 2025
8fde16a
Fix time_unit attribute and assume_timezone overload in cudf/core/col…
vyasr Sep 25, 2025
9f6bf8c
Remove na_sentinel parameter from _label_encoding method
vyasr Sep 25, 2025
5322d8f
Fix ChunkedArray compatibility in ColumnBase.from_arrow
vyasr Sep 25, 2025
7930306
Fix pandas_compat attribute in cudf/core/dataframe.py
vyasr Sep 25, 2025
64a4620
Fix subclass from_arrow signatures to match base class
vyasr Sep 25, 2025
586323a
Fix ChunkedArray handling in from_arrow method
vyasr Sep 25, 2025
b8eb8e3
Fix _from_32_64_arrow method to handle ChunkedArray
vyasr Sep 25, 2025
ffff3fc
Fix ChunkedArray type annotations in dictionary handling
vyasr Sep 25, 2025
c9ef94c
Fix StringColumn replace_re argument type in string accessor
vyasr Sep 25, 2025
beb8d56
Fix write_to_dataset duplicate arguments in cudf/io/parquet.py
vyasr Sep 26, 2025
eaad1fa
Fix variable assignment type in cudf/io/parquet.py
vyasr Sep 26, 2025
13a4734
Inline lists.py _normalize_binop_operand function
vyasr Sep 26, 2025
3514afa
Inline string.py _normalize_binop_operand function
vyasr Sep 26, 2025
1c4882e
Inline categorical.py _normalize_binop_operand function
vyasr Sep 26, 2025
d14d6af
Inline numerical.py _normalize_binop_operand function
vyasr Sep 26, 2025
b389893
Inline decimal.py _normalize_binop_operand function
vyasr Sep 26, 2025
81aab7e
Move DateOffset logic to child class to avoid incorrect handling
vyasr Sep 26, 2025
de931cf
Revert "Inline numerical.py _normalize_binop_operand function"
vyasr Sep 26, 2025
d470e4e
Fix cupy array handling and add a test
vyasr Sep 26, 2025
e4da5d7
Enable intersphinx to find import-aliased third-party modules in type…
vyasr Sep 26, 2025
94f53df
Address reviews
vyasr Sep 27, 2025
e11326f
Merge branch 'branch-25.12' into fix/pyarrow_typing
vyasr Sep 29, 2025
f2e5016
Apply suggestions from code review
vyasr Sep 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ repos:
rev: 'v1.13.0'
hooks:
- id: mypy
additional_dependencies: [types-cachetools]
additional_dependencies: [types-cachetools, pyarrow-stubs]
args: ["--config-file=pyproject.toml",
"python/cudf/cudf",
"python/custreamz/custreamz",
Expand Down
25 changes: 18 additions & 7 deletions docs/cudf/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,13 @@ def _generate_namespaces(namespaces):

_intersphinx_extra_prefixes = ("rmm", "rmm::mr", "mr")

_external_intersphinx_aliases = {
"pandas": "pd",
"pyarrow": "pa",
"numpy": "np",
"cupy": "cp",
}


def _cached_intersphinx_lookup(env, node, contnode):
"""Perform an intersphinx lookup and cache the result.
Expand Down Expand Up @@ -516,6 +523,17 @@ def on_missing_reference(app, env, node, contnode):
# generates. Adding those would clutter the Sphinx output.
return contnode

if node["refdomain"] == "py" and reftarget is not None:
# These replacements are needed because of
# https://github.com/sphinx-doc/sphinx/issues/10151
for module, alias in _external_intersphinx_aliases.items():
if f"{alias}." in node["reftarget"]:
node["reftarget"] = node["reftarget"].replace(alias, module)
if (
ref := _cached_intersphinx_lookup(env, node, contnode)
) is not None:
return ref

if node["refdomain"] in ("std", "cpp") and reftarget is not None:
if any(toskip in reftarget for toskip in _names_to_skip_in_cpp):
return contnode
Expand Down Expand Up @@ -582,18 +600,11 @@ def on_missing_reference(app, env, node, contnode):
# https://github.com/sphinx-doc/sphinx/issues/11225
nitpick_ignore = [
("py:class", "Dtype"),
("py:class", "cp.ndarray"),
("py:class", "pd.DataFrame"),
("py:class", "pandas.core.indexes.frozen.FrozenList"),
("py:class", "pa.Array"),
("py:class", "pa.Table"),
("py:class", "pa.ListType"),
("py:class", "pa.Decimal128Type"),
("py:class", "ScalarLike"),
("py:class", "StringColumn"),
("py:class", "ColumnLike"),
("py:class", "DtypeObj"),
("py:class", "pa.StructType"),
("py:class", "ArrowLike"),
]

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/accessors/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,7 @@ def replace(
if regex:
result = self._column.replace_re(
list(pat),
as_column(repl, dtype=CUDF_STRING_DTYPE),
as_column(repl, dtype=CUDF_STRING_DTYPE), # type: ignore[arg-type]
)
else:
result = self._column.replace_multiple(
Expand Down
6 changes: 1 addition & 5 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,12 @@
from typing import TYPE_CHECKING

import cupy as cp
import pyarrow as pa

import cudf
from cudf.core.column import as_column
from cudf.core.dtypes import CategoricalDtype
from cudf.options import get_option
from cudf.utils.dtypes import can_convert_to_column, cudf_dtype_to_pa_type
from cudf.utils.dtypes import can_convert_to_column

if TYPE_CHECKING:
from cudf.core.index import Index
Expand Down Expand Up @@ -98,10 +97,8 @@ def factorize(
warnings.warn("size_hint is not applicable for cudf.factorize")

if use_na_sentinel:
na_sentinel = pa.scalar(-1)
cats = values.dropna()
else:
na_sentinel = pa.scalar(None, type=cudf_dtype_to_pa_type(values.dtype))
cats = values

cats = cats.unique().astype(values.dtype)
Expand All @@ -111,7 +108,6 @@ def factorize(

labels = values._label_encoding(
cats=cats,
na_sentinel=na_sentinel,
dtype="int64" if get_option("mode.pandas_compatible") else None,
).values

Expand Down
35 changes: 16 additions & 19 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,20 @@ def _reduce(
)

def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
other = self._normalize_binop_operand(other)
if isinstance(other, column.ColumnBase):
if (
isinstance(other, CategoricalColumn)
and other.dtype != self.dtype
):
raise TypeError(
"Categoricals can only compare with the same type"
)
# We'll compare self's decategorized values later for non-CategoricalColumn
else:
codes = column.as_column(
self._encode(other), length=len(self), dtype=self.codes.dtype
)
other = codes._with_type_metadata(self.dtype)
equality_ops = {"__eq__", "__ne__", "NULL_EQUALS", "NULL_NOT_EQUALS"}
if not self.ordered and op not in equality_ops:
raise TypeError(
Expand All @@ -299,23 +312,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
return self._get_decategorized_column()._binaryop(other, op)
return self.codes._binaryop(other.codes, op)

def _normalize_binop_operand(
self, other: ColumnBinaryOperand
) -> column.ColumnBase:
if isinstance(other, column.ColumnBase):
if not isinstance(other, CategoricalColumn):
# We'll compare self's decategorized values later
return other
if other.dtype != self.dtype:
raise TypeError(
"Categoricals can only compare with the same type"
)
return other
codes = column.as_column(
self._encode(other), length=len(self), dtype=self.codes.dtype
)
return codes._with_type_metadata(self.dtype)

def sort_values(self, ascending: bool = True, na_position="last") -> Self:
return self.codes.sort_values( # type: ignore[return-value]
ascending, na_position
Expand Down Expand Up @@ -386,7 +382,8 @@ def to_arrow(self) -> pa.Array:
return pa.DictionaryArray.from_arrays(
self.codes.astype(signed_type).to_arrow(),
self.categories.to_arrow(),
ordered=self.ordered,
# TODO: Investigate if self.ordered can actually be None here
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need a corresponding issue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm planning on investigating this in my follow-up PR (in progress) so I'll open an issue if I need to.

ordered=self.ordered if self.ordered is not None else False,
)

def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self:
Expand Down
26 changes: 13 additions & 13 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,7 +815,7 @@ def to_arrow(self) -> pa.Array:
return self.to_pylibcudf(mode="read").to_arrow()

@classmethod
def from_arrow(cls, array: pa.Array) -> ColumnBase:
def from_arrow(cls, array: pa.Array | pa.ChunkedArray) -> ColumnBase:
"""
Convert PyArrow Array/ChunkedArray to column

Expand Down Expand Up @@ -857,16 +857,23 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:

if pa.types.is_dictionary(array.type):
if isinstance(array, pa.Array):
codes = array.indices
dictionary = array.dictionary
dict_array = cast(pa.DictionaryArray, array)
codes: pa.Array | pa.ChunkedArray = dict_array.indices
dictionary: pa.Array | pa.ChunkedArray = dict_array.dictionary
else:
codes = pa.chunked_array(
[chunk.indices for chunk in array.chunks],
[
cast(pa.DictionaryArray, chunk).indices
for chunk in array.chunks
],
type=array.type.index_type,
)
dictionary = pc.unique(
pa.chunked_array(
[chunk.dictionary for chunk in array.chunks],
[
cast(pa.DictionaryArray, chunk).dictionary
for chunk in array.chunks
],
type=array.type.value_type,
)
)
Expand Down Expand Up @@ -1171,11 +1178,6 @@ def __setitem__(self, key: Any, value: Any) -> None:
if out:
self._mimic_inplace(out, inplace=True)

def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase:
if is_na_like(other):
return pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype))
return NotImplemented

def _all_bools_with_nulls(
self, other: ColumnBase, bool_fill_value: bool
) -> ColumnBase:
Expand Down Expand Up @@ -2089,7 +2091,6 @@ def _label_encoding(
self,
cats: ColumnBase,
dtype: Dtype | None = None,
na_sentinel: pa.Scalar | None = None,
) -> NumericalColumn:
"""
Convert each value in `self` into an integer code, with `cats`
Expand Down Expand Up @@ -2120,8 +2121,7 @@ def _label_encoding(
]
dtype: int8
"""
if na_sentinel is None or not na_sentinel.is_valid:
na_sentinel = pa.scalar(-1)
na_sentinel = pa.scalar(-1)

def _return_sentinel_column():
return as_column(na_sentinel, dtype=dtype, length=len(self))
Expand Down
14 changes: 7 additions & 7 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import re
import warnings
from locale import nl_langinfo
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING, Literal, cast

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -528,11 +528,11 @@ def as_string_column(self, dtype) -> StringColumn:

def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
reflect, op = self._check_reflected_op(op)
if isinstance(other, cudf.DateOffset):
return other._datetime_binop(self, op, reflect=reflect) # type: ignore[attr-defined]
other = self._normalize_binop_operand(other)
if other is NotImplemented:
return NotImplemented
elif isinstance(other, cudf.DateOffset):
return other._datetime_binop(self, op, reflect=reflect) # type: ignore[attr-defined]

if reflect:
lhs = other
Expand All @@ -541,7 +541,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
lhs_unit = lhs.type.unit
other_dtype = cudf_dtype_from_pa_type(lhs.type)
else:
lhs_unit = lhs.time_unit # type: ignore[union-attr]
lhs_unit = lhs.time_unit # type: ignore[attr-defined]
other_dtype = lhs.dtype
rhs_unit = rhs.time_unit
else:
Expand Down Expand Up @@ -813,9 +813,9 @@ def to_pandas(
)

def to_arrow(self) -> pa.Array:
return pa.compute.assume_timezone(
self._local_time.to_arrow(), str(self.dtype.tz)
)
# Cast to expected timestamp array type for assume_timezone
local_array = cast(pa.TimestampArray, self._local_time.to_arrow())
return pa.compute.assume_timezone(local_array, str(self.dtype.tz))

@functools.cached_property
def time_unit(self) -> str:
Expand Down
Loading
Loading