From 6881123fe8e610f33531de61d799a3106d12cd29 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 13 May 2024 21:42:26 +0200 Subject: [PATCH] REFACTOR-#7260: Use 'extract_dtype' internal function in more places Signed-off-by: Anatoly Myachev --- .../core/dataframe/pandas/metadata/dtypes.py | 79 ++++++++++--------- .../storage_formats/pandas/query_compiler.py | 15 +--- 2 files changed, 43 insertions(+), 51 deletions(-) diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index b904c6fbff6..2f64f02e5d2 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -13,11 +13,12 @@ """Module contains class ``ModinDtypes``.""" +from __future__ import annotations + from typing import TYPE_CHECKING, Callable, Optional, Union -import numpy as np import pandas -from pandas._typing import IndexLabel +from pandas._typing import DtypeObj, IndexLabel from pandas.core.dtypes.cast import find_common_type if TYPE_CHECKING: @@ -33,13 +34,13 @@ class DtypesDescriptor: Parameters ---------- - known_dtypes : dict[IndexLabel, np.dtype] or pandas.Series, optional + known_dtypes : dict[IndexLabel, DtypeObj] or pandas.Series, optional Columns that we know dtypes for. cols_with_unknown_dtypes : list[IndexLabel], optional Column names that have unknown dtypes. If specified together with `remaining_dtype`, must describe all columns with unknown dtypes, otherwise, the missing columns will be assigned to `remaining_dtype`. If `cols_with_unknown_dtypes` is incomplete, you must specify `know_all_names=False`. - remaining_dtype : np.dtype, optional + remaining_dtype : DtypeObj, optional Dtype for columns that are not present neither in `known_dtypes` nor in `cols_with_unknown_dtypes`. This parameter is intended to describe columns that we known dtypes for, but don't know their names yet. Note, that this parameter DOESN'T describe dtypes for columns from `cols_with_unknown_dtypes`. @@ -60,10 +61,10 @@ class DtypesDescriptor: def __init__( self, - known_dtypes: Optional[Union[dict[IndexLabel, np.dtype], pandas.Series]] = None, + known_dtypes: Optional[Union[dict[IndexLabel, DtypeObj], pandas.Series]] = None, cols_with_unknown_dtypes: Optional[list[IndexLabel]] = None, - remaining_dtype: Optional[np.dtype] = None, - parent_df: Optional["PandasDataframe"] = None, + remaining_dtype: Optional[DtypeObj] = None, + parent_df: Optional[PandasDataframe] = None, columns_order: Optional[dict[int, IndexLabel]] = None, know_all_names: bool = True, _schema_is_known: Optional[bool] = None, @@ -73,7 +74,7 @@ def __init__( "It's not allowed to pass 'remaining_dtype' and 'know_all_names=False' at the same time." 
) # columns with known dtypes - self._known_dtypes: dict[IndexLabel, np.dtype] = ( + self._known_dtypes: dict[IndexLabel, DtypeObj] = ( {} if known_dtypes is None else dict(known_dtypes) ) if known_dtypes is not None and len(self._known_dtypes) != len(known_dtypes): @@ -106,8 +107,8 @@ def __init__( self._know_all_names: bool = know_all_names # a common dtype for columns that are not present in 'known_dtypes' nor in 'cols_with_unknown_dtypes' - self._remaining_dtype: Optional[np.dtype] = remaining_dtype - self._parent_df: Optional["PandasDataframe"] = parent_df + self._remaining_dtype: Optional[DtypeObj] = remaining_dtype + self._parent_df: Optional[PandasDataframe] = parent_df if columns_order is None: self._columns_order: Optional[dict[int, IndexLabel]] = None # try to compute '._columns_order' using 'parent_df' @@ -132,7 +133,7 @@ def __init__( ) self._columns_order: Optional[dict[int, IndexLabel]] = columns_order - def update_parent(self, new_parent: "PandasDataframe"): + def update_parent(self, new_parent: PandasDataframe): """ Set new parent dataframe. @@ -202,7 +203,7 @@ def __str__(self): # noqa: GL08 def lazy_get( self, ids: list[Union[IndexLabel, int]], numeric_index: bool = False - ) -> "DtypesDescriptor": + ) -> DtypesDescriptor: """ Get dtypes descriptor for a subset of columns without triggering any computations. @@ -255,7 +256,7 @@ def lazy_get( columns_order=columns_order, ) - def copy(self) -> "DtypesDescriptor": + def copy(self) -> DtypesDescriptor: """ Get a copy of this descriptor. @@ -279,7 +280,7 @@ def copy(self) -> "DtypesDescriptor": def set_index( self, new_index: Union[pandas.Index, "ModinIndex"] - ) -> "DtypesDescriptor": + ) -> DtypesDescriptor: """ Set new column names for this descriptor. @@ -324,7 +325,7 @@ def set_index( } return new_self - def equals(self, other: "DtypesDescriptor") -> bool: + def equals(self, other: DtypesDescriptor) -> bool: """ Compare two descriptors for equality. @@ -441,25 +442,25 @@ def to_series(self) -> pandas.Series: self.materialize() return pandas.Series(self._known_dtypes) - def get_dtypes_set(self) -> set[np.dtype]: + def get_dtypes_set(self) -> set[DtypeObj]: """ Get a set of dtypes from the descriptor. Returns ------- - set[np.dtype] + set[DtypeObj] """ if len(self._cols_with_unknown_dtypes) > 0 or not self._know_all_names: self._materialize_cols_with_unknown_dtypes() - known_dtypes: set[np.dtype] = set(self._known_dtypes.values()) + known_dtypes: set[DtypeObj] = set(self._known_dtypes.values()) if self._remaining_dtype is not None: known_dtypes.add(self._remaining_dtype) return known_dtypes @classmethod def _merge_dtypes( - cls, values: list[Union["DtypesDescriptor", pandas.Series, None]] - ) -> "DtypesDescriptor": + cls, values: list[Union[DtypesDescriptor, pandas.Series, None]] + ) -> DtypesDescriptor: """ Union columns described by ``values`` and compute common dtypes for them. @@ -551,8 +552,8 @@ def combine_dtypes(row): @classmethod def concat( - cls, values: list[Union["DtypesDescriptor", pandas.Series, None]], axis: int = 0 - ) -> "DtypesDescriptor": + cls, values: list[Union[DtypesDescriptor, pandas.Series, None]], axis: int = 0 + ) -> DtypesDescriptor: """ Concatenate dtypes descriptors into a single descriptor. 
@@ -746,9 +747,7 @@ class ModinDtypes: def __init__( self, - value: Optional[ - Union[Callable, pandas.Series, DtypesDescriptor, "ModinDtypes"] - ], + value: Optional[Union[Callable, pandas.Series, DtypesDescriptor, ModinDtypes]], ): if callable(value) or isinstance(value, pandas.Series): self._value = value @@ -778,13 +777,13 @@ def is_materialized(self) -> bool: """ return isinstance(self._value, pandas.Series) - def get_dtypes_set(self) -> set[np.dtype]: + def get_dtypes_set(self) -> set[DtypeObj]: """ Get a set of dtypes from the descriptor. Returns ------- - set[np.dtype] + set[DtypeObj] """ if isinstance(self._value, DtypesDescriptor): return self._value.get_dtypes_set() @@ -792,9 +791,7 @@ def get_dtypes_set(self) -> set[np.dtype]: self.get() return set(self._value.values) - def maybe_specify_new_frame_ref( - self, new_parent: "PandasDataframe" - ) -> "ModinDtypes": + def maybe_specify_new_frame_ref(self, new_parent: PandasDataframe) -> ModinDtypes: """ Set a new parent for the stored value if needed. @@ -816,7 +813,7 @@ def maybe_specify_new_frame_ref( return new_self return new_self - def lazy_get(self, ids: list, numeric_index: bool = False) -> "ModinDtypes": + def lazy_get(self, ids: list, numeric_index: bool = False) -> ModinDtypes: """ Get new ``ModinDtypes`` for a subset of columns without triggering any computations. @@ -848,7 +845,7 @@ def lazy_get(self, ids: list, numeric_index: bool = False) -> "ModinDtypes": return ModinDtypes(self._value.iloc[ids] if numeric_index else self._value[ids]) @classmethod - def concat(cls, values: list, axis: int = 0) -> "ModinDtypes": + def concat(cls, values: list, axis: int = 0) -> ModinDtypes: """ Concatenate dtypes. @@ -892,7 +889,7 @@ def concat(cls, values: list, axis: int = 0) -> "ModinDtypes": desc = pandas.concat(values) return ModinDtypes(desc) - def set_index(self, new_index: Union[pandas.Index, "ModinIndex"]) -> "ModinDtypes": + def set_index(self, new_index: Union[pandas.Index, "ModinIndex"]) -> ModinDtypes: """ Set new column names for stored dtypes. @@ -996,7 +993,7 @@ def __getattr__(self, name): self.get() return self._value.__getattribute__(name) - def copy(self) -> "ModinDtypes": + def copy(self) -> ModinDtypes: """ Copy an object without materializing the internal representation. @@ -1200,7 +1197,7 @@ def _materialize_categories(self): def get_categories_dtype( cdt: Union[LazyProxyCategoricalDtype, pandas.CategoricalDtype] -): +) -> DtypeObj: """ Get the categories dtype. @@ -1219,7 +1216,7 @@ def get_categories_dtype( ) -def extract_dtype(value): +def extract_dtype(value) -> DtypeObj | pandas.Series: """ Extract dtype(s) from the passed `value`. @@ -1229,18 +1226,24 @@ def extract_dtype(value): Returns ------- - numpy.dtype or pandas.Series of numpy.dtypes + DtypeObj or pandas.Series of DtypeObj """ from modin.pandas.utils import is_scalar if hasattr(value, "dtype"): + # If we're dealing with a numpy scalar (np.int, np.datetime64, ...) 
+ # we would like to get its internal dtype return value.dtype elif hasattr(value, "dtypes"): return value.dtypes + elif hasattr(value, "to_numpy"): + # If we're dealing with a scalar that can be converted to numpy (for example pandas.Timestamp) + # we would like to convert it and get its proper internal dtype + return value.to_numpy().dtype elif is_scalar(value): if value is None: # previous type was object instead of 'float64' return pandas.api.types.pandas_dtype(value) return pandas.api.types.pandas_dtype(type(value)) else: - return np.array(value).dtype + return pandas.Series(value).dtype diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 234cc58133b..2a4c5d8f26b 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2818,18 +2818,7 @@ def _set_item(df, row_loc): # pragma: no cover if self._modin_frame.has_materialized_dtypes and is_scalar(item): new_dtypes = self.dtypes.copy() old_dtypes = new_dtypes[col_loc] - - if hasattr(item, "dtype"): - # If we're dealing with a numpy scalar (np.int, np.datetime64, ...) - # we would like to get its internal dtype - item_type = item.dtype - elif hasattr(item, "to_numpy"): - # If we're dealing with a scalar that can be converted to numpy (for example pandas.Timestamp) - # we would like to convert it and get its proper internal dtype - item_type = item.to_numpy().dtype - else: - item_type = pandas.api.types.pandas_dtype(type(item)) - + item_type = extract_dtype(item) if isinstance(old_dtypes, pandas.Series): new_dtypes[col_loc] = [ find_common_type([dtype, item_type]) for dtype in old_dtypes.values @@ -4523,7 +4512,7 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): ) else: broadcasted_item, broadcasted_dtypes = item, pandas.Series( - [np.array(item).dtype] * len(col_numeric_index) + [extract_dtype(item)] * len(col_numeric_index) ) new_dtypes = None
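
Reviewer note (not part of the patch): the sketch below is not the Modin implementation itself; it only mirrors the dispatch order of the patched `extract_dtype` (attribute checks first, then scalar handling, then a `pandas.Series` fallback) so the effect of consolidating this logic is easy to see. The helper name `extract_dtype_sketch` and the use of `pandas.api.types.is_scalar` in place of `modin.pandas.utils.is_scalar` are assumptions made to keep the example self-contained.

import numpy as np
import pandas

def extract_dtype_sketch(value):
    # numpy scalars and arrays expose their dtype directly
    if hasattr(value, "dtype"):
        return value.dtype
    # DataFrame-like objects expose a Series of dtypes
    elif hasattr(value, "dtypes"):
        return value.dtypes
    # scalars such as pandas.Timestamp are converted to numpy first
    elif hasattr(value, "to_numpy"):
        return value.to_numpy().dtype
    elif pandas.api.types.is_scalar(value):
        if value is None:
            # None maps to 'float64' rather than 'object'
            return pandas.api.types.pandas_dtype(value)
        return pandas.api.types.pandas_dtype(type(value))
    else:
        # list-likes now go through pandas.Series instead of np.array
        return pandas.Series(value).dtype

print(extract_dtype_sketch(np.float32(1.5)))                 # float32
print(extract_dtype_sketch(pandas.Timestamp("2024-05-13")))  # a datetime64 dtype
print(extract_dtype_sketch(None))                            # float64
print(extract_dtype_sketch([1, 2, 3]))                       # int64

The two call sites updated in query_compiler.py (`_set_item` and `iloc_mut`) previously inlined pieces of this dispatch or used `np.array(item).dtype`; after the patch both delegate to the single shared helper in dtypes.py.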