From 4d23ab74cf04bb26dca1902123514d67fa9140fd Mon Sep 17 00:00:00 2001 From: Hassan Kibirige Date: Fri, 1 Sep 2023 17:02:34 +0300 Subject: [PATCH] ENH: Resolve Future Warnings from pandas v2.1.0 closes #713 --- plotnine/coords/coord.py | 16 ++++++++++++---- plotnine/geoms/geom.py | 2 +- plotnine/mapping/evaluation.py | 5 ++++- plotnine/stats/binning.py | 16 ++++++++-------- plotnine/stats/density.py | 14 ++++++-------- plotnine/stats/stat_bin_2d.py | 10 +++++----- plotnine/stats/stat_boxplot.py | 3 +-- plotnine/stats/stat_count.py | 6 ++++-- plotnine/utils.py | 33 +++++++++++++++++++++++++-------- 9 files changed, 66 insertions(+), 39 deletions(-) diff --git a/plotnine/coords/coord.py b/plotnine/coords/coord.py index c8c060444..b81a7494f 100644 --- a/plotnine/coords/coord.py +++ b/plotnine/coords/coord.py @@ -167,10 +167,18 @@ def munch( ) -> pd.DataFrame: ranges = self.backtransform_range(panel_params) - data.loc[data["x"] == -np.inf, "x"] = ranges.x[0] - data.loc[data["x"] == np.inf, "x"] = ranges.x[1] - data.loc[data["y"] == -np.inf, "y"] = ranges.y[0] - data.loc[data["y"] == np.inf, "y"] = ranges.y[1] + x_neginf = np.isneginf(data["x"]) + x_posinf = np.isposinf(data["x"]) + y_neginf = np.isneginf(data["y"]) + y_posinf = np.isposinf(data["y"]) + if x_neginf.any(): + data.loc[x_neginf, "x"] = ranges.x[0] + if x_posinf.any(): + data.loc[x_posinf, "x"] = ranges.x[1] + if y_neginf.any(): + data.loc[y_neginf, "y"] = ranges.y[0] + if y_posinf.any(): + data.loc[y_posinf, "y"] = ranges.y[1] dist = self.distance(data["x"], data["y"], panel_params) bool_idx = ( diff --git a/plotnine/geoms/geom.py b/plotnine/geoms/geom.py index 3d385d9ec..394fe1fc2 100644 --- a/plotnine/geoms/geom.py +++ b/plotnine/geoms/geom.py @@ -286,7 +286,7 @@ def draw_layer( includes the stacking order of the layer in the plot (*zorder*) """ - for pid, pdata in data.groupby("PANEL"): + for pid, pdata in data.groupby("PANEL", observed=True): if len(pdata) == 0: continue ploc = pdata["PANEL"].iat[0] - 
1 diff --git a/plotnine/mapping/evaluation.py b/plotnine/mapping/evaluation.py index 78fb0b536..804fd71e3 100644 --- a/plotnine/mapping/evaluation.py +++ b/plotnine/mapping/evaluation.py @@ -160,7 +160,10 @@ def reorder(x, y, fun=np.median, ascending=True): if len(x) != len(y): raise ValueError(f"Lengths are not equal. {len(x)=}, {len(x)=}") summary = ( - pd.Series(y).groupby(x).apply(fun).sort_values(ascending=ascending) + pd.Series(y) + .groupby(x, observed=True) + .apply(fun) + .sort_values(ascending=ascending) ) cats = summary.index.to_list() return pd.Categorical(x, categories=cats) diff --git a/plotnine/stats/binning.py b/plotnine/stats/binning.py index dae2e3a42..a5878929b 100644 --- a/plotnine/stats/binning.py +++ b/plotnine/stats/binning.py @@ -14,13 +14,13 @@ from plotnine.typing import FloatArray, TupleFloat2 -__all__ = [ +__all__ = ( "freedman_diaconis_bins", "breaks_from_bins", "breaks_from_binwidth", "assign_bins", "fuzzybreaks", -] +) def freedman_diaconis_bins(a): @@ -37,7 +37,7 @@ def freedman_diaconis_bins(a): if h == 0: bins = np.ceil(np.sqrt(a.size)) else: - bins = np.ceil((np.nanmax(a) - np.nanmin(a)) / h) + bins = np.ceil((np.nanmax(a) - np.nanmin(a)) / h) # type: ignore return int(bins) @@ -168,10 +168,10 @@ def assign_bins(x, breaks, weight=None, pad=False, closed="right"): # - the bins to which each x is assigned # - the weight of each x value # Then create a weighted frequency table - df = pd.DataFrame({"bin_idx": bin_idx, "weight": weight}) - wftable = df.pivot_table("weight", index=["bin_idx"], aggfunc=np.sum)[ - "weight" - ] + bins_long = pd.DataFrame({"bin_idx": bin_idx, "weight": weight}) + wftable = bins_long.pivot_table( + "weight", index=["bin_idx"], aggfunc="sum" + )["weight"] # Empty bins get no value in the computed frequency table. 
# We need to add the zeros and since frequency table is a @@ -279,7 +279,7 @@ def fuzzybreaks( binwidth = (srange[1] - srange[0]) / bins if boundary is None or np.isnan(boundary): - boundary = round_any(srange[0], binwidth, np.floor) # pyright: ignore + boundary = round_any(srange[0], binwidth, np.floor) if recompute_bins: bins = int(np.ceil((srange[1] - boundary) / binwidth)) diff --git a/plotnine/stats/density.py b/plotnine/stats/density.py index b694fc56a..b37d3963f 100644 --- a/plotnine/stats/density.py +++ b/plotnine/stats/density.py @@ -9,7 +9,8 @@ """ import numpy as np -import pandas.api.types as pdtypes + +from ..utils import array_kind def kde_scipy(data, grid, **kwargs): @@ -214,13 +215,10 @@ def get_var_type(col): The origin of the character codes is :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`. """ - if pdtypes.is_numeric_dtype(col): - # continuous + if array_kind.continuous(col): return "c" - elif pdtypes.is_categorical_dtype(col): - # ordered or unordered - return "o" if col.cat.ordered else "u" + elif array_kind.discrete(col): + return "o" if array_kind.ordinal(col) else "u" else: - # unordered if unsure, e.g string columns that - # are not categorical + # unordered if unsure return "u" diff --git a/plotnine/stats/stat_bin_2d.py b/plotnine/stats/stat_bin_2d.py index ab2f3e555..ae223ff2b 100644 --- a/plotnine/stats/stat_bin_2d.py +++ b/plotnine/stats/stat_bin_2d.py @@ -104,13 +104,13 @@ def compute_group(cls, data, scales, **params): xbins = pd.cut( x, bins=xbreaks, # pyright: ignore - labels=False, # pyright: ignore + labels=False, right=True, ) ybins = pd.cut( y, bins=ybreaks, # pyright: ignore - labels=False, # pyright: ignore + labels=False, right=True, ) @@ -123,15 +123,15 @@ def compute_group(cls, data, scales, **params): ybreaks[0] -= np.diff(np.diff(ybreaks))[0] xbreaks[0] -= np.diff(np.diff(xbreaks))[0] - df = pd.DataFrame( + bins_grid_long = pd.DataFrame( { "xbins": xbins, "ybins": ybins, "weight": weight, } ) - table = 
df.pivot_table( - "weight", index=["xbins", "ybins"], aggfunc=np.sum + table = bins_grid_long.pivot_table( + "weight", index=["xbins", "ybins"], aggfunc="sum" )["weight"] # create rectangles diff --git a/plotnine/stats/stat_boxplot.py b/plotnine/stats/stat_boxplot.py index 9611eebaf..260ea9439 100644 --- a/plotnine/stats/stat_boxplot.py +++ b/plotnine/stats/stat_boxplot.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -import pandas.api.types as pdtypes from ..doctools import document from ..utils import resolution @@ -97,7 +96,7 @@ def compute_group(cls, data, scales, **params): else: width = params["width"] - if pdtypes.is_categorical_dtype(data["x"]): + if isinstance(data["x"].dtype, pd.CategoricalDtype): x = data["x"].iloc[0] else: x = np.mean([data["x"].min(), data["x"].max()]) diff --git a/plotnine/stats/stat_count.py b/plotnine/stats/stat_count.py index 540978329..e3765a369 100644 --- a/plotnine/stats/stat_count.py +++ b/plotnine/stats/stat_count.py @@ -65,9 +65,11 @@ def compute_group(cls, data, scales, **params): weight = data.get("weight", [1] * len(x)) # pyright: ignore width = params["width"] - df = pd.DataFrame({"weight": weight, "x": x}) + xdata_long = pd.DataFrame({"x": x, "weight": weight}) # weighted frequency count - count = df.pivot_table("weight", index=["x"], aggfunc=np.sum)["weight"] + count = xdata_long.pivot_table("weight", index=["x"], aggfunc="sum")[ + "weight" + ] x = count.index count = count.to_numpy() return pd.DataFrame( diff --git a/plotnine/utils.py b/plotnine/utils.py index 039f43997..1230a0bfe 100644 --- a/plotnine/utils.py +++ b/plotnine/utils.py @@ -14,7 +14,6 @@ import numpy as np import pandas as pd -import pandas.api.types as pdtypes # missing in type stubs from pandas.core.groupby import DataFrameGroupBy # type: ignore @@ -207,14 +206,14 @@ def add_margins( categories = {} for v in itertools.chain(*vars): col = df[v] - if not pdtypes.is_categorical_dtype(df[v]): + if not isinstance(df[v].dtype, pd.CategoricalDtype): 
col = pd.Categorical(df[v]) categories[v] = col.categories if "(all)" not in categories[v]: categories[v] = categories[v].insert(len(categories[v]), "(all)") for v in merged.columns.intersection(list(categories.keys())): - merged[v] = merged[v].astype(pdtypes.CategoricalDtype(categories[v])) + merged[v] = merged[v].astype(pd.CategoricalDtype(categories[v])) return merged @@ -286,9 +285,7 @@ def _id_var(x: pd.Series[Any], drop: bool = False) -> list[int]: if len(x) == 0: return [] - categorical = pdtypes.is_categorical_dtype(x) - - if categorical: + if array_kind.categorical(x): if drop: x = x.cat.remove_unused_categories() lst = list(x.cat.codes + 1) @@ -593,7 +590,7 @@ def groupby_apply( axis = 0 lst = [] - for _, d in df.groupby(cols): + for _, d in df.groupby(cols, observed=True): # function fn should be free to modify dataframe d, therefore # do not mark d as a slice of df i.e no SettingWithCopyWarning lst.append(func(d, *args, **kwargs)) @@ -1180,10 +1177,30 @@ def ordinal(arr): out : bool Whether array `arr` is an ordered categorical """ - if pdtypes.is_categorical_dtype(arr): + if isinstance(arr.dtype, pd.CategoricalDtype): return arr.cat.ordered return False + @staticmethod + def categorical(arr): + """ + Return True if array is a categorical + + Parameters + ---------- + arr : list-like + List + + Returns + ------- + bool + Whether array `arr` is a categorical + """ + if not hasattr(arr, "dtype"): + return False + + return isinstance(arr.dtype, pd.CategoricalDtype) + def log(x, base=None): """