|
46 | 46 | from pandas._typing import InterpolateOptions, Self
|
47 | 47 | from pandas.api.extensions import no_default
|
48 | 48 | from pandas.core.arrays import ArrowExtensionArray, ExtensionArray
|
| 49 | +from pandas.core.dtypes.common import is_float_dtype |
49 | 50 | from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses, validate_indices
|
| 51 | +from pandas.io.formats.format import format_array |
50 | 52 |
|
51 | 53 | from nested_pandas.series.dtype import NestedDtype
|
52 | 54 | from nested_pandas.series.utils import enumerate_chunks, is_pa_type_a_list
|
53 | 55 |
|
54 | 56 | __all__ = ["NestedExtensionArray"]
|
55 | 57 |
|
56 | 58 |
|
| 59 | +BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = True |
| 60 | +"""Use a trick to bypass pandas limitations on extension array formatting |
| 61 | +
|
| 62 | +Pandas array formatting works in such a way that Pandas objects are always |
| 63 | +formatted with `str()`; see the _GenericArrayFormatter._format_strings() |
| 64 | +method: |
| 65 | +https://github.com/pandas-dev/pandas/blob/0d85d57b18b18e6b216ff081eac0952cb27d0e13/pandas/io/formats/format.py#L1219 |
| 66 | +
|
| 67 | +_GenericArrayFormatter is used after _ExtensionArrayFormatter was called |
| 68 | +initially and extracted values from the extension array with |
| 69 | +np.asarray(values, dtype=object): |
| 70 | +https://github.com/pandas-dev/pandas/blob/0d85d57b18b18e6b216ff081eac0952cb27d0e13/pandas/io/formats/format.py#L1516 |
| 71 | +
|
| 72 | +Since our implementation of numpy conversion would return an object array |
| 73 | +of data-frames, these data-frames would always be converted using `str()`, |
| 74 | +which produces ugly and unreadable output. That's why when `__array__` is |
| 75 | +called we check if it was actually called by _ExtensionArrayFormatter and |
| 76 | +instead of returning a numpy array of data-frames, we return an array of |
| 77 | +`_DataFrameWrapperForRepresentation` objects. That class is used for that |
| 78 | +purpose only and should never be used for anything else. |
| 79 | +""" |
| 80 | +try: |
| 81 | + from pandas.io.formats.format import _ExtensionArrayFormatter |
| 82 | +except ImportError: |
| 83 | + BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = False |
| 84 | + |
| 85 | +NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW = 1 |
| 86 | +"""Maximum number of nested data-frame's rows to show inside a parent object""" |
| 87 | + |
| 88 | + |
| 89 | +def _is_called_from_func(func: Callable) -> bool: |
| 90 | + """Check if the given function appears in the call stack by matching its code object. |
| 91 | +
|
| 92 | + Parameters |
| 93 | + ---------- |
| 94 | + func |
| 95 | + Function to check |
| 96 | +
|
| 97 | + Returns |
| 98 | + ------- |
| 99 | + bool |
| 100 | + """ |
| 101 | + from inspect import currentframe |
| 102 | + |
| 103 | + frame = currentframe() |
| 104 | + while frame: |
| 105 | + if frame.f_code is func.__code__: |
| 106 | + return True |
| 107 | + frame = frame.f_back # Move up the call stack |
| 108 | + return False |
| 109 | + |
| 110 | + |
def _is_called_from_ext_array_fmter_fmt_strings() -> bool:
    """Check if the code was called from _ExtensionArrayFormatter._format_strings

    Returns
    -------
    bool
        True if `_ExtensionArrayFormatter._format_strings` is on the call stack.

    Raises
    ------
    RuntimeError
        If BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK is falsy.
    """
    if BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK:
        return _is_called_from_func(_ExtensionArrayFormatter._format_strings)
    raise RuntimeError("Set BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK to True")
| 121 | + |
| 122 | + |
| 123 | +class _DataFrameWrapperForRepresentation: |
| 124 | + """A class used to store nested data-frames for the formatting purposes |
| 125 | +
|
| 126 | + It encapsulates the input data-frame and gives access to all its attributes |
| 127 | +
|
| 128 | + Parameters |
| 129 | + ---------- |
| 130 | + df : pd.DataFrame |
| 131 | + Data |
| 132 | +
|
| 133 | + Notes |
| 134 | + ----- |
| 135 | + Do not use it out of the formatting code |
| 136 | + """ |
| 137 | + |
| 138 | + def __init__(self, df): |
| 139 | + self.__internal_nested_df = df |
| 140 | + |
| 141 | + def __getattr__(self, item): |
| 142 | + return getattr(self.__internal_nested_df, item) |
| 143 | + |
| 144 | + def __len__(self): |
| 145 | + return len(self.__internal_nested_df) |
| 146 | + |
| 147 | + |
57 | 148 | def to_pyarrow_dtype(dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None) -> pa.DataType | None:
|
58 | 149 | """Convert the dtype to pyarrow.DataType"""
|
59 | 150 | if isinstance(dtype, NestedDtype):
|
@@ -390,15 +481,31 @@ def copy(self) -> Self: # type: ignore[name-defined] # noqa: F821
|
390 | 481 | return type(self)(self._chunked_array, validate=False)
|
391 | 482 |
|
def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
    """Return the function used to turn one array element into a string.

    Parameters
    ----------
    boxed : bool, default False
        If False, the built-in `repr` is returned. If True, a compact
        single-line formatter is returned, which shows at most
        NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW rows of the
        nested data-frame followed by the total row count.

    Returns
    -------
    Callable[[Any], str | None]
        Formatter taking a single element (a nested data-frame-like object
        or `pd.NA`) and returning its string representation.
    """
    if not boxed:
        return repr

    def format_series(series):
        """Format every value of a single column to a string."""
        if is_float_dtype(series.dtype):
            # Format with the default Pandas formatter and strip white-spaces it adds.
            # Keep the original index: the other columns keep theirs, and
            # DataFrame.apply aligns the per-column results on the index.
            formatted = format_array(series.to_numpy(), None)
            return pd.Series(formatted, index=series.index).str.strip()
        # Convert to string, add extra quotes for strings
        return series.apply(repr)

    def format_row(row):
        """Join already formatted cells of a row as `name: value` pairs."""
        return ", ".join(f"{name}: {cell}" for name, cell in zip(row.index, row))

    def box_formatter(value):
        if value is pd.NA:
            return str(pd.NA)
        # Select first few rows
        head = value.iloc[:NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW]
        # Format series to strings using Pandas default formatters
        head = head.apply(format_series, axis=0)
        str_rows = "; ".join(f"{{{format_row(row)}}}" for _index, row in head.iterrows())
        if len(value) <= NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW:
            return f"[{str_rows}]"
        # Some rows were cut off: mark it and report the full length
        return f"[{str_rows}; …] ({len(value)} rows)"

    return box_formatter
|
@@ -446,7 +553,23 @@ def __arrow_array__(self, type=None):
|
446 | 553 |
|
def __array__(self, dtype=None):
    """Convert the extension array to a numpy array.

    When the formatting trick is enabled, an object dtype is requested and
    the call originates from `_ExtensionArrayFormatter._format_strings`,
    every data-frame element of the result is replaced in place with a
    `_DataFrameWrapperForRepresentation`, so Pandas would call our
    `_formatter` method on them. Non-data-frame elements (NA) are left as is.
    """
    result = self.to_numpy(dtype=dtype, copy=False)

    # Cheap checks go first; the call-stack inspection is evaluated last
    repack_for_formatting = (
        BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK
        and dtype == np.object_
        and _is_called_from_ext_array_fmter_fmt_strings()
    )
    if not repack_for_formatting:
        return result

    for position, element in enumerate(result):
        # Could be data-frame or NA
        if isinstance(element, pd.DataFrame):
            result[position] = _DataFrameWrapperForRepresentation(element)
    return result
450 | 573 |
|
451 | 574 | # Adopted from ArrowExtensionArray
|
452 | 575 | def __getstate__(self):
|
|
0 commit comments