Skip to content

Commit 6798700

Browse files
authored
Merge pull request #195 from lincc-frameworks/ext_array_formatting_tricked
Trick extension array formatting
2 parents 99ecc3e + 54dac6c commit 6798700

File tree

2 files changed

+129
-6
lines changed

2 files changed

+129
-6
lines changed

src/nested_pandas/series/ext_array.py

Lines changed: 128 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,105 @@
4646
from pandas._typing import InterpolateOptions, Self
4747
from pandas.api.extensions import no_default
4848
from pandas.core.arrays import ArrowExtensionArray, ExtensionArray
49+
from pandas.core.dtypes.common import is_float_dtype
4950
from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses, validate_indices
51+
from pandas.io.formats.format import format_array
5052

5153
from nested_pandas.series.dtype import NestedDtype
5254
from nested_pandas.series.utils import enumerate_chunks, is_pa_type_a_list
5355

5456
__all__ = ["NestedExtensionArray"]
5557

5658

59+
BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = True
60+
"""Use a trick to bypass pandas limitations on extension array formatting
61+
62+
Pandas array formatting works in such a way that pandas objects are always
63+
being formatted with `str()`, see _GenericArrayFormatter._format_strings()
64+
method:
65+
https://github.com/pandas-dev/pandas/blob/0d85d57b18b18e6b216ff081eac0952cb27d0e13/pandas/io/formats/format.py#L1219
66+
67+
_GenericArrayFormatter is used after _ExtensionArrayFormatter was called
68+
initially and extracted values from the extension array with
69+
np.asarray(values, dtype=object):
70+
https://github.com/pandas-dev/pandas/blob/0d85d57b18b18e6b216ff081eac0952cb27d0e13/pandas/io/formats/format.py#L1516
71+
72+
Since our implementation of numpy conversion would return an object array
73+
of data-frames, these data-frames would always be converted using `str()`,
74+
which produces ugly and unreadable output. That's why when `__array__` is
75+
called we check if it was actually called by _ExtensionArrayFormatter and
76+
instead of returning a numpy array of data-frames, we return an array of
77+
`_DataFrameWrapperForRepresentation` objects. That class is used for that
78+
purpose only and should never be used for anything else.
79+
"""
80+
try:
81+
from pandas.io.formats.format import _ExtensionArrayFormatter
82+
except ImportError:
83+
BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = False
84+
85+
NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW = 1
86+
"""Maximum number of nested data-frame's rows to show inside a parent object"""
87+
88+
89+
def _is_called_from_func(func: Callable) -> bool:
90+
"""Check if the given function appears in the call stack by matching its code object.
91+
92+
Parameters
93+
----------
94+
func
95+
Function to check
96+
97+
Returns
98+
-------
99+
bool
100+
"""
101+
from inspect import currentframe
102+
103+
frame = currentframe()
104+
while frame:
105+
if frame.f_code is func.__code__:
106+
return True
107+
frame = frame.f_back # Move up the call stack
108+
return False
109+
110+
111+
def _is_called_from_ext_array_fmter_fmt_strings():
    """Tell whether `_ExtensionArrayFormatter._format_strings` is on the call stack

    Returns
    -------
    bool
        Result of `_is_called_from_func` for
        `_ExtensionArrayFormatter._format_strings`.

    Raises
    ------
    RuntimeError
        If `BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK` is falsy.
    """
    if BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK:
        target = _ExtensionArrayFormatter._format_strings
        return _is_called_from_func(target)
    raise RuntimeError("Set BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK to True")
121+
122+
123+
class _DataFrameWrapperForRepresentation:
124+
"""A class used to store nested data-frames for the formatting purposes
125+
126+
It encapsulates the input data-frame and gives access to all its attributes
127+
128+
Parameters
129+
----------
130+
df : pd.DataFrame
131+
Data
132+
133+
Notes
134+
-----
135+
Do not use it out of the formatting code
136+
"""
137+
138+
def __init__(self, df):
139+
self.__internal_nested_df = df
140+
141+
def __getattr__(self, item):
142+
return getattr(self.__internal_nested_df, item)
143+
144+
def __len__(self):
145+
return len(self.__internal_nested_df)
146+
147+
57148
def to_pyarrow_dtype(dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None) -> pa.DataType | None:
58149
"""Convert the dtype to pyarrow.DataType"""
59150
if isinstance(dtype, NestedDtype):
@@ -390,15 +481,31 @@ def copy(self) -> Self: # type: ignore[name-defined] # noqa: F821
390481
return type(self)(self._chunked_array, validate=False)
391482

392483
def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
    """Return a function that renders a single element of the array as text.

    Parameters
    ----------
    boxed : bool, default False
        If True, return a formatter that produces a compact one-line preview
        of a nested data-frame. If False, return `repr`.

    Returns
    -------
    Callable[[Any], str | None]
        For ``boxed=True`` the formatter maps `pd.NA` to ``str(pd.NA)`` and
        any other value to ``"[{col: val, ...}]"`` built from its first
        `NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW` rows; when the
        value has more rows than that, ``"; …] (N rows)"`` is appended
        instead of the closing bracket.
    """
    if not boxed:
        return repr

    def box_formatter(value):
        if value is pd.NA:
            return str(pd.NA)
        # Select first few rows
        df = value.iloc[:NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW]

        def format_series(series):
            if is_float_dtype(series.dtype):
                # Format with the default Pandas formatter and strip the
                # white-spaces it adds. `format_array` returns a plain list,
                # so re-attach the original index: otherwise this column gets
                # a fresh RangeIndex and `DataFrame.apply` mis-aligns it with
                # the non-float columns whenever the nested frame has a
                # non-default index.
                formatted = format_array(series.to_numpy(), None)
                return pd.Series(formatted, index=series.index).str.strip()
            # Convert to string, add extra quotes for strings
            return series.apply(repr)

        def format_row(row):
            return ", ".join(f"{name}: {cell}" for name, cell in zip(row.index, row))

        # Format series to strings
        df = df.apply(format_series, axis=0)
        str_rows = "; ".join(f"{{{format_row(row)}}}" for _index, row in df.iterrows())
        if len(value) <= NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW:
            return f"[{str_rows}]"
        return f"[{str_rows}; …] ({len(value)} rows)"

    return box_formatter
@@ -446,7 +553,23 @@ def __arrow_array__(self, type=None):
446553

447554
def __array__(self, dtype=None):
    """Convert the extension array to a numpy array."""
    result = self.to_numpy(dtype=dtype, copy=False)

    # When an object array is requested from inside
    # _ExtensionArrayFormatter._format_strings, the nested data-frames are
    # repacked into wrapper objects, so Pandas would call our _formatter
    # method on them.
    wrap_for_display = (
        BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK
        and dtype == np.object_
        and _is_called_from_ext_array_fmter_fmt_strings()
    )
    if not wrap_for_display:
        return result

    for position, element in enumerate(result):
        # Elements are data-frames or NA; only data-frames get wrapped
        if isinstance(element, pd.DataFrame):
            result[position] = _DataFrameWrapperForRepresentation(element)
    return result
450573

451574
# Adopted from ArrowExtensionArray
452575
def __getstate__(self):

tests/nested_pandas/series/test_ext_array.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -893,7 +893,7 @@ def test__formatter_boxed():
893893
)._formatter(boxed=True)
894894
d = {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}
895895
df = pd.DataFrame(d)
896-
assert formatter(df) == str(d)
896+
assert formatter(df) == "[{a: 1, b: -4.0}; …] (3 rows)"
897897

898898

899899
def test__formetter_boxed_na():

0 commit comments

Comments
 (0)